Initial commit
@@ -0,0 +1,385 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
// N.B. we don't include async_generator.h as it's relatively heavy
#include <functional>
#include <memory>
#include <vector>

#include "parquet/file_reader.h"
#include "parquet/platform.h"
#include "parquet/properties.h"

namespace arrow {

class ChunkedArray;
class KeyValueMetadata;
class RecordBatchReader;
struct Scalar;
class Schema;
class Table;
class RecordBatch;

}  // namespace arrow

namespace parquet {

class FileMetaData;
class SchemaDescriptor;

namespace arrow {

class ColumnChunkReader;
class ColumnReader;
struct SchemaManifest;
class RowGroupReader;

/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
///
/// This interface caters to different use cases and thus provides different
/// entry points. In its simplest form, it caters to a user who wants to
/// read the whole Parquet file at once with the `FileReader::ReadTable` method.
///
/// More advanced users that also want to implement parallelism on top of each
/// single Parquet file should do this at the RowGroup level. For this, they can
/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
/// RowGroup as a table.
///
/// In the most advanced situation, where a consumer wants to independently read
/// RowGroups in parallel and consume each column individually, they can call
/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::ChunkedArray`
/// instance.
///
/// Finally, one can also get a stream of record batches using
/// `FileReader::GetRecordBatchReader()`. This can internally decode columns
/// in parallel if use_threads was enabled in the ArrowReaderProperties.
///
/// The Parquet format supports an optional integer field_id which can be assigned
/// to a field. Arrow will convert these field IDs to a metadata key named
/// PARQUET:field_id on the appropriate field.
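///
/// A minimal end-to-end sketch (the function and variable names below are
/// illustrative, not part of this API; it assumes an already-opened
/// ::arrow::io::RandomAccessFile):
///
/// \code
/// ::arrow::Status ReadWholeFile(std::shared_ptr<::arrow::io::RandomAccessFile> input,
///                               std::shared_ptr<::arrow::Table>* table) {
///   // OpenFile is the factory function declared further down in this header.
///   ARROW_ASSIGN_OR_RAISE(std::unique_ptr<FileReader> reader,
///                         OpenFile(std::move(input), ::arrow::default_memory_pool()));
///   return reader->ReadTable(table);
/// }
/// \endcode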
// TODO(wesm): nested data does not always make sense with this user
// interface unless you are only reading a single leaf node from a branch of
// a table. For example:
//
// repeated group data {
//   optional group record {
//     optional int32 val1;
//     optional byte_array val2;
//     optional bool val3;
//   }
//   optional int32 val4;
// }
//
// In the Parquet file, there are 4 leaf nodes:
//
// * data.record.val1
// * data.record.val2
// * data.record.val3
// * data.val4
//
// When materializing this data in an Arrow array, we would have:
//
// data: list<struct<
//   record: struct<
//     val1: int32,
//     val2: string (= list<uint8>),
//     val3: bool,
//   >,
//   val4: int32
// >>
//
// However, in the Parquet format, each leaf node has its own repetition and
// definition levels describing the structure of the intermediate nodes in
// this array structure. Thus, we will need to scan the leaf data for a group
// of leaf nodes part of the same type tree to create a single result Arrow
// nested array structure.
//
// This is additionally complicated by "chunky" repeated fields or very large
// byte arrays.
class PARQUET_EXPORT FileReader {
 public:
  /// Factory function to create a FileReader from a ParquetFileReader and properties
  static ::arrow::Status Make(::arrow::MemoryPool* pool,
                              std::unique_ptr<ParquetFileReader> reader,
                              const ArrowReaderProperties& properties,
                              std::unique_ptr<FileReader>* out);

  /// Factory function to create a FileReader from a ParquetFileReader
  static ::arrow::Status Make(::arrow::MemoryPool* pool,
                              std::unique_ptr<ParquetFileReader> reader,
                              std::unique_ptr<FileReader>* out);

  // Since the distribution of columns amongst a Parquet file's row groups may
  // be uneven (the number of values in each column chunk can be different), we
  // provide a column-oriented read interface. The ColumnReader hides the
  // details of paging through the file's row groups and yielding
  // fully-materialized arrow::Array instances.
  //
  // Returns an error status if the column of interest is not flat.
  // The indicated column index is relative to the schema.
  virtual ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) = 0;

  /// \brief Return the arrow schema for all the columns.
  virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;

  /// \brief Read a column as a whole into a chunked array.
  ///
  /// The index i refers to the index of the top level schema field, which may
  /// be nested or flat - e.g.
  ///
  /// 0 foo.bar
  ///   foo.bar.baz
  ///   foo.qux
  /// 1 foo2
  /// 2 foo3
  ///
  /// i=0 will read the entire foo struct, i=1 the foo2 primitive column, etc.
  virtual ::arrow::Status ReadColumn(int i,
                                     std::shared_ptr<::arrow::ChunkedArray>* out) = 0;

  /// \brief Return a RecordBatchReader of all row groups and columns.
  virtual ::arrow::Result<std::unique_ptr<::arrow::RecordBatchReader>>
  GetRecordBatchReader() = 0;

  /// \brief Return a RecordBatchReader of row groups selected from row_group_indices.
  ///
  /// Note that the ordering in row_group_indices matters. FileReaders must outlive
  /// their RecordBatchReaders.
  ///
  /// \returns error Result if row_group_indices contains an invalid index
  virtual ::arrow::Result<std::unique_ptr<::arrow::RecordBatchReader>>
  GetRecordBatchReader(const std::vector<int>& row_group_indices) = 0;

  /// \brief Return a RecordBatchReader of row groups selected from
  /// row_group_indices, whose columns are selected by column_indices.
  ///
  /// Note that the ordering in row_group_indices and column_indices
  /// matters. FileReaders must outlive their RecordBatchReaders.
  ///
  /// \returns error Result if either row_group_indices or column_indices
  /// contains an invalid index
  virtual ::arrow::Result<std::unique_ptr<::arrow::RecordBatchReader>>
  GetRecordBatchReader(const std::vector<int>& row_group_indices,
                       const std::vector<int>& column_indices) = 0;
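
  // A hedged usage sketch for streaming selected row groups and columns from
  // an existing FileReader `reader` (all names other than the methods shown
  // here are illustrative):
  //
  //   ARROW_ASSIGN_OR_RAISE(auto batch_reader,
  //                         reader->GetRecordBatchReader({0, 2}, {0, 1}));
  //   std::shared_ptr<::arrow::RecordBatch> batch;
  //   while (true) {
  //     RETURN_NOT_OK(batch_reader->ReadNext(&batch));
  //     if (batch == nullptr) break;  // end of stream
  //     // ... process `batch` ...
  //   }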

  /// \brief Return a RecordBatchReader of row groups selected from
  /// row_group_indices, whose columns are selected by column_indices.
  ///
  /// Note that the ordering in row_group_indices and column_indices
  /// matters. FileReaders must outlive their RecordBatchReaders.
  ///
  /// \param row_group_indices which row groups to read (order determines read order).
  /// \param column_indices which columns to read (order determines output schema).
  /// \param[out] out record batch stream from parquet data.
  ///
  /// \returns error Status if either row_group_indices or column_indices
  /// contains an invalid index
  /// \deprecated Deprecated in 21.0.0. Use arrow::Result version instead.
  ARROW_DEPRECATED("Deprecated in 21.0.0. Use arrow::Result version instead.")
  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
                                       const std::vector<int>& column_indices,
                                       std::shared_ptr<::arrow::RecordBatchReader>* out);

  /// \deprecated Deprecated in 21.0.0. Use arrow::Result version instead.
  ARROW_DEPRECATED("Deprecated in 21.0.0. Use arrow::Result version instead.")
  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
                                       std::shared_ptr<::arrow::RecordBatchReader>* out);

  /// \deprecated Deprecated in 21.0.0. Use arrow::Result version instead.
  ARROW_DEPRECATED("Deprecated in 21.0.0. Use arrow::Result version instead.")
  ::arrow::Status GetRecordBatchReader(std::shared_ptr<::arrow::RecordBatchReader>* out);

  /// \brief Return a generator of record batches.
  ///
  /// The FileReader must outlive the generator, so this requires that you pass in a
  /// shared_ptr.
  ///
  /// \returns error Result if either row_group_indices or column_indices contains an
  /// invalid index
  virtual ::arrow::Result<
      std::function<::arrow::Future<std::shared_ptr<::arrow::RecordBatch>>()>>
  GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
                          const std::vector<int> row_group_indices,
                          const std::vector<int> column_indices,
                          ::arrow::internal::Executor* cpu_executor = NULLPTR,
                          int64_t rows_to_readahead = 0) = 0;

  /// Read all columns into a Table
  virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;

  /// \brief Read the given columns into a Table
  ///
  /// The indicated column indices are relative to the internal representation
  /// of the parquet table. For instance:
  /// 0 foo.bar
  ///   foo.bar.baz 0
  ///   foo.bar.baz2 1
  ///   foo.qux 2
  /// 1 foo2 3
  /// 2 foo3 4
  ///
  /// i=0 will read foo.bar.baz, i=1 will read only foo.bar.baz2, and so on.
  /// Only leaf fields have indices; foo itself doesn't have an index.
  /// To get the index for a particular leaf field, one can use
  /// manifest().schema_fields to get the top level fields, and then walk the
  /// tree to identify the relevant leaf fields and access their column_index.
  /// To get the total number of leaf fields, use FileMetadata.num_columns().
  virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                                    std::shared_ptr<::arrow::Table>* out) = 0;

  virtual ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
                                       std::shared_ptr<::arrow::Table>* out) = 0;

  virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;

  virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
                                        const std::vector<int>& column_indices,
                                        std::shared_ptr<::arrow::Table>* out) = 0;

  virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
                                        std::shared_ptr<::arrow::Table>* out) = 0;

  /// \brief Scan the file contents with one thread, return the number of rows
  virtual ::arrow::Status ScanContents(std::vector<int> columns,
                                       const int32_t column_batch_size,
                                       int64_t* num_rows) = 0;

  /// \brief Return a reader for the RowGroup; this object must not outlive the
  /// FileReader.
  virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;

  /// \brief The number of row groups in the file
  virtual int num_row_groups() const = 0;

  virtual ParquetFileReader* parquet_reader() const = 0;

  /// Set whether to use multiple threads during reads of multiple columns.
  /// By default only one thread is used.
  virtual void set_use_threads(bool use_threads) = 0;

  /// Set the number of records to read per batch for the RecordBatchReader.
  virtual void set_batch_size(int64_t batch_size) = 0;

  virtual const ArrowReaderProperties& properties() const = 0;

  virtual const SchemaManifest& manifest() const = 0;

  virtual ~FileReader() = default;
};

class RowGroupReader {
 public:
  virtual ~RowGroupReader() = default;
  virtual std::shared_ptr<ColumnChunkReader> Column(int column_index) = 0;
  virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
                                    std::shared_ptr<::arrow::Table>* out) = 0;
  virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;

 private:
  struct Iterator;
};

class ColumnChunkReader {
 public:
  virtual ~ColumnChunkReader() = default;
  virtual ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
};

// At this point, the column reader is a stream iterator. It only knows how to
// read the next batch of values for a particular column from the file until it
// runs out.
//
// We also do not expose any internal Parquet details, such as row groups. This
// might change in the future.
class PARQUET_EXPORT ColumnReader {
 public:
  virtual ~ColumnReader() = default;

  // Scan the next array of the indicated size. The actual size of the
  // returned array may be less than the passed size depending on how much data
  // is available in the file.
  //
  // When all the data in the file has been exhausted, the result is set to
  // nullptr.
  //
  // Returns Status::OK on a successful read, including if you have exhausted
  // the data available in the file.
  virtual ::arrow::Status NextBatch(int64_t batch_size,
                                    std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
};

/// \brief Experimental helper class for bindings (like Python) that struggle
/// with either std::move or C++ exceptions
class PARQUET_EXPORT FileReaderBuilder {
 public:
  FileReaderBuilder();

  /// Create FileReaderBuilder from an Arrow file and optional properties / metadata
  ::arrow::Status Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
                       const ReaderProperties& properties = default_reader_properties(),
                       std::shared_ptr<FileMetaData> metadata = NULLPTR);

  /// Create FileReaderBuilder from a file path and optional properties / metadata
  ::arrow::Status OpenFile(const std::string& path, bool memory_map = false,
                           const ReaderProperties& props = default_reader_properties(),
                           std::shared_ptr<FileMetaData> metadata = NULLPTR);

  ParquetFileReader* raw_reader() { return raw_reader_.get(); }

  /// Set the Arrow MemoryPool used for memory allocation
  FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
  /// Set the Arrow reader properties
  FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
  /// Build the FileReader instance
  ::arrow::Status Build(std::unique_ptr<FileReader>* out);
  ::arrow::Result<std::unique_ptr<FileReader>> Build();

 private:
  ::arrow::MemoryPool* pool_;
  ArrowReaderProperties properties_;
  std::unique_ptr<ParquetFileReader> raw_reader_;
};
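
// A hedged usage sketch for FileReaderBuilder (the path and function name are
// illustrative, not part of this API):
//
//   ::arrow::Result<std::unique_ptr<FileReader>> OpenForArrow(::arrow::MemoryPool* pool) {
//     FileReaderBuilder builder;
//     RETURN_NOT_OK(builder.OpenFile("example.parquet"));
//     return builder.memory_pool(pool)->Build();
//   }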

/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
///
/// @{

/// \brief Build FileReader from an Arrow file and MemoryPool
///
/// Advanced settings are supported through the FileReaderBuilder class.
PARQUET_EXPORT
::arrow::Result<std::unique_ptr<FileReader>> OpenFile(
    std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* allocator);

/// @}

PARQUET_EXPORT
::arrow::Status StatisticsAsScalars(const Statistics& statistics,
                                    std::shared_ptr<::arrow::Scalar>* min,
                                    std::shared_ptr<::arrow::Scalar>* max);

namespace internal {

PARQUET_EXPORT
::arrow::Status FuzzReader(const uint8_t* data, int64_t size);

}  // namespace internal
}  // namespace arrow
}  // namespace parquet
@@ -0,0 +1,184 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cassert>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"

#include "parquet/level_conversion.h"
#include "parquet/platform.h"
#include "parquet/schema.h"

namespace parquet {

class ArrowReaderProperties;
class ArrowWriterProperties;
class WriterProperties;

namespace arrow {

/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
/// schema into a Parquet schema.
///
/// @{

PARQUET_EXPORT
::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
                            const WriterProperties& properties,
                            const ArrowWriterProperties& arrow_properties,
                            schema::NodePtr* out);

PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
                                const WriterProperties& properties,
                                const ArrowWriterProperties& arrow_properties,
                                std::shared_ptr<SchemaDescriptor>* out);

PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
                                const WriterProperties& properties,
                                std::shared_ptr<SchemaDescriptor>* out);

/// @}

/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
/// schema into an Arrow schema.
///
/// @{

PARQUET_EXPORT
::arrow::Status FromParquetSchema(
    const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
    const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
    std::shared_ptr<::arrow::Schema>* out);

PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
                                  const ArrowReaderProperties& properties,
                                  std::shared_ptr<::arrow::Schema>* out);

PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
                                  std::shared_ptr<::arrow::Schema>* out);

/// @}
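
// A hedged usage sketch: derive the Arrow schema for a Parquet file from its
// file metadata (the function and parameter names are illustrative):
//
//   ::arrow::Status SchemaOf(ParquetFileReader* parquet_reader,
//                            std::shared_ptr<::arrow::Schema>* out) {
//     return FromParquetSchema(parquet_reader->metadata()->schema(), out);
//   }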

/// \brief Bridge between an arrow::Field and parquet column indices.
struct PARQUET_EXPORT SchemaField {
  std::shared_ptr<::arrow::Field> field;
  std::vector<SchemaField> children;

  // Only set for leaf nodes
  int column_index = -1;

  parquet::internal::LevelInfo level_info;

  bool is_leaf() const { return column_index != -1; }
};

/// \brief Bridge between a parquet Schema and an arrow Schema.
///
/// Exposes parquet columns as a tree structure. Useful to traverse and link
/// between arrow's Schema and parquet's Schema.
struct PARQUET_EXPORT SchemaManifest {
  static ::arrow::Status Make(
      const SchemaDescriptor* schema,
      const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
      const ArrowReaderProperties& properties, SchemaManifest* manifest);

  const SchemaDescriptor* descr;
  std::shared_ptr<::arrow::Schema> origin_schema;
  std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
  std::vector<SchemaField> schema_fields;

  std::unordered_map<int, const SchemaField*> column_index_to_field;
  std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;

  ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
    auto it = column_index_to_field.find(column_index);
    if (it == column_index_to_field.end()) {
      return ::arrow::Status::KeyError("Column index ", column_index,
                                       " not found in schema manifest, may be malformed");
    }
    *out = it->second;
    return ::arrow::Status::OK();
  }

  const SchemaField* GetParent(const SchemaField* field) const {
    // Returns nullptr also if not found
    auto it = child_to_parent.find(field);
    if (it == child_to_parent.end()) {
      return NULLPTR;
    }
    return it->second;
  }

  /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
  /// correspond to the column root (first node below the parquet schema's root group) of
  /// each leaf referenced in column_indices.
  ///
  /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
  /// the roots are `a` and `i` (return=[0,2]).
  ///
  /// root
  /// -- a  <------
  /// -- -- b  |  |
  /// -- -- -- c  |
  /// -- -- -- d  |
  /// -- -- -- -- e
  /// -- f
  /// -- -- g
  /// -- -- -- h
  /// -- i  <---
  /// -- -- j  |
  /// -- -- -- k
  ::arrow::Result<std::vector<int>> GetFieldIndices(
      const std::vector<int>& column_indices) const {
    const schema::GroupNode* group = descr->group_node();
    std::unordered_set<int> already_added;

    std::vector<int> out;
    for (int column_idx : column_indices) {
      if (column_idx < 0 || column_idx >= descr->num_columns()) {
        return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
      }

      auto field_node = descr->GetColumnRoot(column_idx);
      auto field_idx = group->FieldIndex(*field_node);
      if (field_idx == -1) {
        return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
      }

      if (already_added.insert(field_idx).second) {
        out.push_back(field_idx);
      }
    }
    return out;
  }
};
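
// A hedged usage sketch: build a manifest and map a leaf column index back to
// its Arrow field (the function and parameter names are illustrative):
//
//   ::arrow::Status LeafField(const SchemaDescriptor* descr,
//                             const ArrowReaderProperties& props,
//                             std::shared_ptr<::arrow::Field>* out) {
//     SchemaManifest manifest;
//     RETURN_NOT_OK(SchemaManifest::Make(descr, /*metadata=*/NULLPTR, props, &manifest));
//     const SchemaField* schema_field = NULLPTR;
//     RETURN_NOT_OK(manifest.GetColumnField(/*column_index=*/0, &schema_field));
//     *out = schema_field->field;
//     return ::arrow::Status::OK();
//   }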

}  // namespace arrow
}  // namespace parquet
@@ -0,0 +1,487 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <limits>
#include <memory>
#include <random>
#include <string>
#include <utility>
#include <vector>

#include "arrow/array.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/decimal.h"
#include "arrow/util/float16.h"
#include "parquet/column_reader.h"
#include "parquet/test_util.h"

namespace parquet {

using internal::RecordReader;

namespace arrow {

using ::arrow::Array;
using ::arrow::ChunkedArray;
using ::arrow::Status;

template <typename T, int32_t PRECISION, typename = ::arrow::enable_if_decimal<T>>
struct DecimalWithPrecisionAndScale {
  using type = T;
  static_assert(PRECISION >= T::kMinPrecision && PRECISION <= T::kMaxPrecision,
                "Invalid precision value");
  static constexpr ::arrow::Type::type type_id = T::type_id;
  static constexpr int32_t precision = PRECISION;
  static constexpr int32_t scale = PRECISION - 1;
};
template <int32_t PRECISION>
using Decimal32WithPrecisionAndScale =
    DecimalWithPrecisionAndScale<::arrow::Decimal32Type, PRECISION>;
template <int32_t PRECISION>
using Decimal64WithPrecisionAndScale =
    DecimalWithPrecisionAndScale<::arrow::Decimal64Type, PRECISION>;
template <int32_t PRECISION>
using Decimal128WithPrecisionAndScale =
    DecimalWithPrecisionAndScale<::arrow::Decimal128Type, PRECISION>;
template <int32_t PRECISION>
using Decimal256WithPrecisionAndScale =
    DecimalWithPrecisionAndScale<::arrow::Decimal256Type, PRECISION>;

template <class ArrowType>
::arrow::enable_if_floating_point<ArrowType, Status> NonNullArray(
    size_t size, std::shared_ptr<Array>* out) {
  using c_type = typename ArrowType::c_type;
  std::vector<c_type> values;
  if constexpr (::arrow::is_half_float_type<ArrowType>::value) {
    values.resize(size);
    test::random_float16_numbers(static_cast<int>(size), 0, ::arrow::util::Float16(0.0f),
                                 ::arrow::util::Float16(1.0f), values.data());
  } else {
    ::arrow::random_real(size, 0, static_cast<c_type>(0), static_cast<c_type>(1),
                         &values);
  }
  ::arrow::NumericBuilder<ArrowType> builder;
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
  return builder.Finish(out);
}

template <class ArrowType>
::arrow::enable_if_integer<ArrowType, Status> NonNullArray(size_t size,
                                                           std::shared_ptr<Array>* out) {
  std::vector<typename ArrowType::c_type> values;
  ::arrow::randint(size, 0, 64, &values);

  // Passing the data type so this will work with TimestampType too
  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
                                             ::arrow::default_memory_pool());
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
  return builder.Finish(out);
}

template <class ArrowType>
::arrow::enable_if_date<ArrowType, Status> NonNullArray(size_t size,
                                                        std::shared_ptr<Array>* out) {
  std::vector<typename ArrowType::c_type> values;
  ::arrow::randint(size, 0, 24, &values);
  for (size_t i = 0; i < size; i++) {
    values[i] *= 86400000;
  }

  // Passing the data type so this will work with TimestampType too
  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
                                             ::arrow::default_memory_pool());
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
  return builder.Finish(out);
}

template <class ArrowType>
::arrow::enable_if_base_binary<ArrowType, Status> NonNullArray(
    size_t size, std::shared_ptr<Array>* out) {
  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
  BuilderType builder;
  for (size_t i = 0; i < size; i++) {
    RETURN_NOT_OK(builder.Append("test-string"));
  }
  return builder.Finish(out);
}

template <typename ArrowType>
::arrow::enable_if_fixed_size_binary<ArrowType, Status> NonNullArray(
    size_t size, std::shared_ptr<Array>* out) {
  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
  // Set byte_width to the length of "fixed": 5
  // TODO: find a way to generate test data with more diversity.
  BuilderType builder(::arrow::fixed_size_binary(5));
  for (size_t i = 0; i < size; i++) {
    RETURN_NOT_OK(builder.Append("fixed"));
  }
  return builder.Finish(out);
}

template <int32_t byte_width>
static void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) {
  auto gen = ::arrow::random::RandomArrayGenerator(seed);
  std::shared_ptr<Array> decimals;
  if constexpr (byte_width == 4) {
    decimals = gen.Decimal32(::arrow::decimal32(precision, 0), n);
  } else if constexpr (byte_width == 8) {
    decimals = gen.Decimal64(::arrow::decimal64(precision, 0), n);
  } else if constexpr (byte_width == 16) {
    decimals = gen.Decimal128(::arrow::decimal128(precision, 0), n);
  } else {
    decimals = gen.Decimal256(::arrow::decimal256(precision, 0), n);
  }
  std::memcpy(out, decimals->data()->GetValues<uint8_t>(1, 0), byte_width * n);
}

template <typename ArrowType, int32_t precision = ArrowType::precision>
::arrow::enable_if_t<std::is_same_v<ArrowType, DecimalWithPrecisionAndScale<
                                                   typename ArrowType::type, precision>>,
                     Status>
NonNullArray(size_t size, std::shared_ptr<Array>* out) {
  constexpr int32_t kDecimalPrecision = precision;
  constexpr int32_t kDecimalScale = ArrowType::scale;

  const auto type =
      std::make_shared<typename ArrowType::type>(kDecimalPrecision, kDecimalScale);
  const int32_t byte_width = type->byte_width();

  constexpr int32_t seed = 0;

  ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
  random_decimals<ArrowType::type::kByteWidth>(size, seed, kDecimalPrecision,
                                               out_buf->mutable_data());

  using Builder = typename ::arrow::TypeTraits<typename ArrowType::type>::BuilderType;
  Builder builder(type);
  RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size));
  return builder.Finish(out);
}

template <class ArrowType>
::arrow::enable_if_boolean<ArrowType, Status> NonNullArray(size_t size,
                                                           std::shared_ptr<Array>* out) {
  std::vector<uint8_t> values;
  ::arrow::randint(size, 0, 1, &values);
  ::arrow::BooleanBuilder builder;
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
  return builder.Finish(out);
}

// This helper function only supports (size/2) nulls.
template <typename ArrowType>
::arrow::enable_if_floating_point<ArrowType, Status> NullableArray(
    size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
  using c_type = typename ArrowType::c_type;
  std::vector<c_type> values;
  if constexpr (::arrow::is_half_float_type<ArrowType>::value) {
    values.resize(size);
    test::random_float16_numbers(static_cast<int>(size), 0, ::arrow::util::Float16(-1e4f),
                                 ::arrow::util::Float16(1e4f), values.data());
  } else {
    ::arrow::random_real(size, seed, static_cast<c_type>(-1e10),
                         static_cast<c_type>(1e10), &values);
  }
  std::vector<uint8_t> valid_bytes(size, 1);

  for (size_t i = 0; i < num_nulls; i++) {
    valid_bytes[i * 2] = 0;
  }

  ::arrow::NumericBuilder<ArrowType> builder;
  if (values.size() > 0) {
    RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
  }
  return builder.Finish(out);
}

// This helper function only supports (size/2) nulls.
template <typename ArrowType>
::arrow::enable_if_integer<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
                                                            uint32_t seed,
                                                            std::shared_ptr<Array>* out) {
  std::vector<typename ArrowType::c_type> values;

  // Seed is random in Arrow right now
  (void)seed;
  ::arrow::randint(size, 0, 64, &values);
  std::vector<uint8_t> valid_bytes(size, 1);

  for (size_t i = 0; i < num_nulls; i++) {
    valid_bytes[i * 2] = 0;
  }

  // Passing the data type so this will work with TimestampType too
  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
                                             ::arrow::default_memory_pool());
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
  return builder.Finish(out);
}

template <typename ArrowType>
::arrow::enable_if_date<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
                                                         uint32_t seed,
                                                         std::shared_ptr<Array>* out) {
  std::vector<typename ArrowType::c_type> values;

  // Seed is random in Arrow right now
  (void)seed;
  ::arrow::randint(size, 0, 24, &values);
  for (size_t i = 0; i < size; i++) {
    values[i] *= 86400000;
  }
  std::vector<uint8_t> valid_bytes(size, 1);

  for (size_t i = 0; i < num_nulls; i++) {
    valid_bytes[i * 2] = 0;
  }

  // Passing the data type so this will work with TimestampType too
  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
                                             ::arrow::default_memory_pool());
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
  return builder.Finish(out);
}

// This helper function only supports (size/2) nulls so far.
template <typename ArrowType>
::arrow::enable_if_base_binary<ArrowType, Status> NullableArray(
    size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
  std::vector<uint8_t> valid_bytes(size, 1);

  for (size_t i = 0; i < num_nulls; i++) {
    valid_bytes[i * 2] = 0;
  }

  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
  BuilderType builder;

  const int kBufferSize = 10;
  uint8_t buffer[kBufferSize];
  for (size_t i = 0; i < size; i++) {
    if (!valid_bytes[i]) {
      RETURN_NOT_OK(builder.AppendNull());
    } else {
      ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
      if (ArrowType::is_utf8) {
        // Trivially force data to be valid UTF8 by making it all ASCII
        for (auto& byte : buffer) {
          byte &= 0x7f;
        }
      }
      RETURN_NOT_OK(builder.Append(buffer, kBufferSize));
    }
  }
  return builder.Finish(out);
}

// This helper function only supports (size/2) nulls so far,
// same as NullableArray<String|Binary>(..)
template <typename ArrowType>
::arrow::enable_if_fixed_size_binary<ArrowType, Status> NullableArray(
    size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
  std::vector<uint8_t> valid_bytes(size, 1);

  for (size_t i = 0; i < num_nulls; i++) {
    valid_bytes[i * 2] = 0;
  }

  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
  const int byte_width = 10;
  BuilderType builder(::arrow::fixed_size_binary(byte_width));

  const int kBufferSize = byte_width;
  uint8_t buffer[kBufferSize];
  for (size_t i = 0; i < size; i++) {
    if (!valid_bytes[i]) {
      RETURN_NOT_OK(builder.AppendNull());
    } else {
      ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
      RETURN_NOT_OK(builder.Append(buffer));
    }
  }
  return builder.Finish(out);
}

template <typename ArrowType, int32_t precision = ArrowType::precision>
::arrow::enable_if_t<std::is_same_v<ArrowType, DecimalWithPrecisionAndScale<
                                                   typename ArrowType::type, precision>>,
                     Status>
NullableArray(size_t size, size_t num_nulls, uint32_t seed,
              std::shared_ptr<::arrow::Array>* out) {
  std::vector<uint8_t> valid_bytes(size, '\1');

  for (size_t i = 0; i < num_nulls; ++i) {
    valid_bytes[i * 2] = '\0';
  }

  constexpr int32_t kDecimalPrecision = precision;
  constexpr int32_t kDecimalScale = ArrowType::scale;

  const auto type =
      std::make_shared<typename ArrowType::type>(kDecimalPrecision, kDecimalScale);
  const int32_t byte_width = type->byte_width();

  ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
  random_decimals<ArrowType::type::kByteWidth>(size, seed, precision,
                                               out_buf->mutable_data());

  using Builder = typename ::arrow::TypeTraits<typename ArrowType::type>::BuilderType;
  Builder builder(type);
  RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));
  return builder.Finish(out);
}

// This helper function only supports (size/2) nulls so far.
template <class ArrowType>
::arrow::enable_if_boolean<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
                                                            uint32_t seed,
                                                            std::shared_ptr<Array>* out) {
  std::vector<uint8_t> values;

  // Seed is random in Arrow right now
  (void)seed;

  ::arrow::randint(size, 0, 1, &values);
  std::vector<uint8_t> valid_bytes(size, 1);

  for (size_t i = 0; i < num_nulls; i++) {
    valid_bytes[i * 2] = 0;
  }

  ::arrow::BooleanBuilder builder;
  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
  return builder.Finish(out);
}

/// Wrap an Array into a ListArray by splitting it up into size lists.
///
/// This helper function only supports (size/2) nulls.
Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size,
                     int64_t null_count, const std::string& item_name,
                     bool nullable_values, std::shared_ptr<::arrow::ListArray>* out) {
  // We always include an empty list
  int64_t non_null_entries = size - null_count - 1;
  int64_t length_per_entry = values->length() / non_null_entries;

  auto offsets = AllocateBuffer();
  RETURN_NOT_OK(offsets->Resize((size + 1) * sizeof(int32_t)));
  int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data());

  auto null_bitmap = AllocateBuffer();
  int64_t bitmap_size = ::arrow::bit_util::BytesForBits(size);
  RETURN_NOT_OK(null_bitmap->Resize(bitmap_size));
  uint8_t* null_bitmap_ptr = null_bitmap->mutable_data();
  memset(null_bitmap_ptr, 0, bitmap_size);

  int32_t current_offset = 0;
  for (int64_t i = 0; i < size; i++) {
    offsets_ptr[i] = current_offset;
    if (!(((i % 2) == 0) && ((i / 2) < null_count))) {
      // Non-null list (list with index 1 is always empty).
      ::arrow::bit_util::SetBit(null_bitmap_ptr, i);
      if (i != 1) {
        current_offset += static_cast<int32_t>(length_per_entry);
      }
    }
  }
  offsets_ptr[size] = static_cast<int32_t>(values->length());

  auto value_field = ::arrow::field(item_name, values->type(), nullable_values);
  *out = std::make_shared<::arrow::ListArray>(::arrow::list(value_field), size, offsets,
                                              values, null_bitmap, null_count);

  return Status::OK();
}

// Make an array containing only empty lists, with a null values array
Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
  // Allocate an offsets buffer containing only zeroes
  const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t);
  ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, ::arrow::AllocateBuffer(offsets_nbytes));
  memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);

  auto value_field =
      ::arrow::field("item", ::arrow::float64(), false /* nullable_values */);
  auto list_type = ::arrow::list(value_field);

  std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap */,
                                                        nullptr /* values */};
  auto child_data =
      ::arrow::ArrayData::Make(value_field->type(), 0, std::move(child_buffers));

  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
                                                  std::move(offsets_buffer)};
  auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers));
  array_data->child_data.push_back(child_data);

  *out_array = ::arrow::MakeArray(array_data);
  return Status::OK();
}

std::shared_ptr<::arrow::Table> MakeSimpleTable(
    const std::shared_ptr<ChunkedArray>& values, bool nullable) {
  auto schema = ::arrow::schema({::arrow::field("col", values->type(), nullable)});
  return ::arrow::Table::Make(schema, {values});
}

std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr<Array>& values,
                                                bool nullable) {
  auto carr = std::make_shared<::arrow::ChunkedArray>(values);
  return MakeSimpleTable(carr, nullable);
}

template <typename T>
void ExpectArray(T* expected, Array* result) {
  auto p_array = static_cast<::arrow::PrimitiveArray*>(result);
  for (int i = 0; i < result->length(); i++) {
    EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->values()->data())[i]);
  }
}

template <typename ArrowType>
void ExpectArrayT(void* expected, Array* result) {
  ::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result);
  for (int64_t i = 0; i < result->length(); i++) {
    EXPECT_EQ(reinterpret_cast<typename ArrowType::c_type*>(expected)[i],
              reinterpret_cast<const typename ArrowType::c_type*>(
                  p_array->values()->data())[i]);
  }
}

template <>
void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) {
  ::arrow::BooleanBuilder builder;
  ARROW_EXPECT_OK(
      builder.AppendValues(reinterpret_cast<uint8_t*>(expected), result->length()));

  std::shared_ptr<Array> expected_array;
  ARROW_EXPECT_OK(builder.Finish(&expected_array));
  EXPECT_TRUE(result->Equals(*expected_array));
}

}  // namespace arrow

}  // namespace parquet
@@ -0,0 +1,176 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>

#include "parquet/platform.h"
#include "parquet/properties.h"

namespace arrow {

class Array;
class ChunkedArray;
class RecordBatch;
class Schema;
class Table;

}  // namespace arrow

namespace parquet {

class FileMetaData;
class ParquetFileWriter;

namespace arrow {

/// \brief Iterative FileWriter class
///
/// For basic usage, can write a Table at a time, creating one or more row
/// groups per write call.
///
/// For advanced usage, can write column-by-column: Start a new RowGroup or
/// Chunk with NewRowGroup, then write column-by-column the whole column chunk.
///
/// If PARQUET:field_id is present as a metadata key on a field, and the corresponding
/// value is a nonnegative integer, then it will be used as the field_id in the parquet
/// file.
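///
/// A minimal sketch of the record-batch path (the function and variable names
/// are illustrative, not part of this API):
///
/// \code
/// ::arrow::Status WriteBatches(
///     const std::vector<std::shared_ptr<::arrow::RecordBatch>>& batches,
///     const std::shared_ptr<::arrow::Schema>& schema,
///     std::shared_ptr<::arrow::io::OutputStream> sink) {
///   ARROW_ASSIGN_OR_RAISE(
///       std::unique_ptr<FileWriter> writer,
///       FileWriter::Open(*schema, ::arrow::default_memory_pool(), std::move(sink)));
///   for (const auto& batch : batches) {
///     RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
///   }
///   return writer->Close();
/// }
/// \endcode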
class PARQUET_EXPORT FileWriter {
 public:
  static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr<ParquetFileWriter> writer,
                              std::shared_ptr<::arrow::Schema> schema,
                              std::shared_ptr<ArrowWriterProperties> arrow_properties,
                              std::unique_ptr<FileWriter>* out);

  /// \brief Try to create an Arrow to Parquet file writer.
  ///
  /// \param schema schema of the data that will be passed.
  /// \param pool memory pool to use.
  /// \param sink output stream to write Parquet data.
  /// \param properties general Parquet writer properties.
  /// \param arrow_properties Arrow-specific writer properties.
  ///
  /// \since 11.0.0
  static ::arrow::Result<std::unique_ptr<FileWriter>> Open(
      const ::arrow::Schema& schema, MemoryPool* pool,
      std::shared_ptr<::arrow::io::OutputStream> sink,
      std::shared_ptr<WriterProperties> properties = default_writer_properties(),
      std::shared_ptr<ArrowWriterProperties> arrow_properties =
          default_arrow_writer_properties());

  /// Return the Arrow schema to be written to.
  virtual std::shared_ptr<::arrow::Schema> schema() const = 0;

  /// \brief Write a Table to Parquet.
  ///
  /// \param table Arrow table to write.
  /// \param chunk_size maximum number of rows to write per row group.
  virtual ::arrow::Status WriteTable(
      const ::arrow::Table& table, int64_t chunk_size = DEFAULT_MAX_ROW_GROUP_LENGTH) = 0;

  /// \brief Start a new row group.
  ///
  /// Returns an error if not all columns have been written.
  virtual ::arrow::Status NewRowGroup() = 0;

  /// \brief Write a ColumnChunk in the row group using an array.
  virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0;

  /// \brief Write a ColumnChunk in the row group using a slice of a ChunkedArray.
  virtual ::arrow::Status WriteColumnChunk(
      const std::shared_ptr<::arrow::ChunkedArray>& data, int64_t offset,
      int64_t size) = 0;

  /// \brief Write a ColumnChunk in the row group using a ChunkedArray.
  virtual ::arrow::Status WriteColumnChunk(
      const std::shared_ptr<::arrow::ChunkedArray>& data) = 0;

  /// \brief Start a new buffered row group.
  ///
  /// Returns an error if not all columns have been written.
  virtual ::arrow::Status NewBufferedRowGroup() = 0;

  /// \brief Write a RecordBatch into the buffered row group.
  ///
  /// Multiple RecordBatches can be written into the same row group
  /// through this method.
  ///
  /// WriterProperties.max_row_group_length() is respected and a new
  /// row group will be created if the current row group exceeds the
  /// limit.
  ///
  /// Batches get flushed to the output stream once NewBufferedRowGroup()
  /// or Close() is called.
  ///
  /// WARNING: If you are writing multiple files in parallel in the same
  /// executor, deadlock may occur if ArrowWriterProperties::use_threads
  /// is set to true to write columns in parallel. Please disable the
  /// use_threads option in this case.
  virtual ::arrow::Status WriteRecordBatch(const ::arrow::RecordBatch& batch) = 0;

  /// \brief Write the footer and close the file.
  virtual ::arrow::Status Close() = 0;
  virtual ~FileWriter();

  virtual MemoryPool* memory_pool() const = 0;
  /// \brief Add key-value metadata to the file.
  /// \param[in] key_value_metadata the metadata to add.
  /// \note This will overwrite any existing metadata with the same key.
  /// \return Error if Close() has been called.
  ///
  /// WARNING: If `store_schema` is enabled, `ARROW:schema` would be stored
  /// in the key-value metadata. Overwriting this key would result in
  /// `store_schema` being unusable during read.
  virtual ::arrow::Status AddKeyValueMetadata(
      const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata) = 0;
  /// \brief Return the file metadata, only available after calling Close().
  virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
};

/// \brief Write Parquet file metadata only to the indicated Arrow OutputStream.
PARQUET_EXPORT
::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
                                  ::arrow::io::OutputStream* sink);

/// \brief Write a metadata-only Parquet file to the indicated Arrow OutputStream.
PARQUET_EXPORT
::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
                                  ::arrow::io::OutputStream* sink);

/// \brief Write a Table to Parquet.
///
/// This writes one table in a single shot. To write a Parquet file with
/// multiple tables iteratively, see parquet::arrow::FileWriter.
///
/// \param table Table to write.
/// \param pool memory pool to use.
/// \param sink output stream to write Parquet data.
/// \param chunk_size maximum number of rows to write per row group.
/// \param properties general Parquet writer properties.
/// \param arrow_properties Arrow-specific writer properties.
::arrow::Status PARQUET_EXPORT
WriteTable(const ::arrow::Table& table, MemoryPool* pool,
           std::shared_ptr<::arrow::io::OutputStream> sink,
           int64_t chunk_size = DEFAULT_MAX_ROW_GROUP_LENGTH,
           std::shared_ptr<WriterProperties> properties = default_writer_properties(),
           std::shared_ptr<ArrowWriterProperties> arrow_properties =
               default_arrow_writer_properties());
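
// A hedged one-shot sketch (variable names are illustrative): write `table`
// to `sink` with row groups of at most 64 * 1024 rows.
//
//   RETURN_NOT_OK(WriteTable(*table, ::arrow::default_memory_pool(), sink,
//                            /*chunk_size=*/64 * 1024));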

}  // namespace arrow
}  // namespace parquet