Initial commit
This commit is contained in:
@@ -0,0 +1,20 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "parquet/exception.h"
|
||||
@@ -0,0 +1,35 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
// Column reader API
|
||||
#include "parquet/column_reader.h"
|
||||
#include "parquet/column_scanner.h"
|
||||
#include "parquet/exception.h"
|
||||
#include "parquet/file_reader.h"
|
||||
#include "parquet/metadata.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/printer.h"
|
||||
#include "parquet/properties.h"
|
||||
#include "parquet/statistics.h"
|
||||
|
||||
// Schemas
|
||||
#include "parquet/api/schema.h"
|
||||
|
||||
// IO
|
||||
#include "parquet/api/io.h"
|
||||
@@ -0,0 +1,21 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
// Schemas
|
||||
#include "parquet/schema.h"
|
||||
@@ -0,0 +1,25 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "parquet/api/io.h"
|
||||
#include "parquet/api/schema.h"
|
||||
#include "parquet/column_writer.h"
|
||||
#include "parquet/exception.h"
|
||||
#include "parquet/file_writer.h"
|
||||
#include "parquet/statistics.h"
|
||||
@@ -0,0 +1,385 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
// N.B. we don't include async_generator.h as it's relatively heavy
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "parquet/file_reader.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/properties.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class ChunkedArray;
|
||||
class KeyValueMetadata;
|
||||
class RecordBatchReader;
|
||||
struct Scalar;
|
||||
class Schema;
|
||||
class Table;
|
||||
class RecordBatch;
|
||||
|
||||
} // namespace arrow
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class FileMetaData;
|
||||
class SchemaDescriptor;
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class ColumnChunkReader;
|
||||
class ColumnReader;
|
||||
struct SchemaManifest;
|
||||
class RowGroupReader;
|
||||
|
||||
/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
|
||||
///
|
||||
/// This interface caters for different use cases and thus provides different
|
||||
/// interfaces. In its most simplistic form, we cater for a user that wants to
|
||||
/// read the whole Parquet file at once with the `FileReader::ReadTable` method.
|
||||
///
|
||||
/// More advanced users that also want to implement parallelism on top of each
|
||||
/// single Parquet file should do this on the RowGroup level. For this, they can
|
||||
/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
|
||||
/// RowGroup as a table.
|
||||
///
|
||||
/// In the most advanced situation, where a consumer wants to independently read
|
||||
/// RowGroups in parallel and consume each column individually, they can call
|
||||
/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::ChunkedArray`
|
||||
/// instance.
|
||||
///
|
||||
/// Finally, one can also get a stream of record batches using
|
||||
/// `FileReader::GetRecordBatchReader()`. This can internally decode columns
|
||||
/// in parallel if use_threads was enabled in the ArrowReaderProperties.
|
||||
///
|
||||
/// The parquet format supports an optional integer field_id which can be assigned
|
||||
/// to a field. Arrow will convert these field IDs to a metadata key named
|
||||
/// PARQUET:field_id on the appropriate field.
|
||||
// TODO(wesm): nested data does not always make sense with this user
|
||||
// interface unless you are only reading a single leaf node from a branch of
|
||||
// a table. For example:
|
||||
//
|
||||
// repeated group data {
|
||||
// optional group record {
|
||||
// optional int32 val1;
|
||||
// optional byte_array val2;
|
||||
// optional bool val3;
|
||||
// }
|
||||
// optional int32 val4;
|
||||
// }
|
||||
//
|
||||
// In the Parquet file, there are 4 leaf nodes:
|
||||
//
|
||||
// * data.record.val1
|
||||
// * data.record.val2
|
||||
// * data.record.val3
|
||||
// * data.val4
|
||||
//
|
||||
// When materializing this data in an Arrow array, we would have:
|
||||
//
|
||||
// data: list<struct<
|
||||
// record: struct<
|
||||
// val1: int32,
|
||||
// val2: string (= list<uint8>),
|
||||
// val3: bool,
|
||||
// >,
|
||||
// val4: int32
|
||||
// >>
|
||||
//
|
||||
// However, in the Parquet format, each leaf node has its own repetition and
|
||||
// definition levels describing the structure of the intermediate nodes in
|
||||
// this array structure. Thus, we will need to scan the leaf data for a group
|
||||
// of leaf nodes part of the same type tree to create a single result Arrow
|
||||
// nested array structure.
|
||||
//
|
||||
// This is additionally complicated by "chunky" repeated fields or very large byte
|
||||
// arrays
|
||||
class PARQUET_EXPORT FileReader {
|
||||
public:
|
||||
/// Factory function to create a FileReader from a ParquetFileReader and properties
|
||||
static ::arrow::Status Make(::arrow::MemoryPool* pool,
|
||||
std::unique_ptr<ParquetFileReader> reader,
|
||||
const ArrowReaderProperties& properties,
|
||||
std::unique_ptr<FileReader>* out);
|
||||
|
||||
/// Factory function to create a FileReader from a ParquetFileReader
|
||||
static ::arrow::Status Make(::arrow::MemoryPool* pool,
|
||||
std::unique_ptr<ParquetFileReader> reader,
|
||||
std::unique_ptr<FileReader>* out);
|
||||
|
||||
// Since the distribution of columns amongst a Parquet file's row groups may
|
||||
// be uneven (the number of values in each column chunk can be different), we
|
||||
// provide a column-oriented read interface. The ColumnReader hides the
|
||||
// details of paging through the file's row groups and yielding
|
||||
// fully-materialized arrow::Array instances
|
||||
//
|
||||
// Returns error status if the column of interest is not flat.
|
||||
// The indicated column index is relative to the schema
|
||||
virtual ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) = 0;
|
||||
|
||||
/// \brief Return arrow schema for all the columns.
|
||||
virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;
|
||||
|
||||
/// \brief Read column as a whole into a chunked array.
|
||||
///
|
||||
/// The index i refers the index of the top level schema field, which may
|
||||
/// be nested or flat - e.g.
|
||||
///
|
||||
/// 0 foo.bar
|
||||
/// foo.bar.baz
|
||||
/// foo.qux
|
||||
/// 1 foo2
|
||||
/// 2 foo3
|
||||
///
|
||||
/// i=0 will read the entire foo struct, i=1 the foo2 primitive column etc
|
||||
virtual ::arrow::Status ReadColumn(int i,
|
||||
std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
|
||||
|
||||
/// \brief Return a RecordBatchReader of all row groups and columns.
|
||||
virtual ::arrow::Result<std::unique_ptr<::arrow::RecordBatchReader>>
|
||||
GetRecordBatchReader() = 0;
|
||||
|
||||
/// \brief Return a RecordBatchReader of row groups selected from row_group_indices.
|
||||
///
|
||||
/// Note that the ordering in row_group_indices matters. FileReaders must outlive
|
||||
/// their RecordBatchReaders.
|
||||
///
|
||||
/// \returns error Result if row_group_indices contains an invalid index
|
||||
virtual ::arrow::Result<std::unique_ptr<::arrow::RecordBatchReader>>
|
||||
GetRecordBatchReader(const std::vector<int>& row_group_indices) = 0;
|
||||
|
||||
/// \brief Return a RecordBatchReader of row groups selected from
|
||||
/// row_group_indices, whose columns are selected by column_indices.
|
||||
///
|
||||
/// Note that the ordering in row_group_indices and column_indices
|
||||
/// matter. FileReaders must outlive their RecordBatchReaders.
|
||||
///
|
||||
/// \returns error Result if either row_group_indices or column_indices
|
||||
/// contains an invalid index
|
||||
virtual ::arrow::Result<std::unique_ptr<::arrow::RecordBatchReader>>
|
||||
GetRecordBatchReader(const std::vector<int>& row_group_indices,
|
||||
const std::vector<int>& column_indices) = 0;
|
||||
|
||||
/// \brief Return a RecordBatchReader of row groups selected from
|
||||
/// row_group_indices, whose columns are selected by column_indices.
|
||||
///
|
||||
/// Note that the ordering in row_group_indices and column_indices
|
||||
/// matter. FileReaders must outlive their RecordBatchReaders.
|
||||
///
|
||||
/// \param row_group_indices which row groups to read (order determines read order).
|
||||
/// \param column_indices which columns to read (order determines output schema).
|
||||
/// \param[out] out record batch stream from parquet data.
|
||||
///
|
||||
/// \returns error Status if either row_group_indices or column_indices
|
||||
/// contains an invalid index
|
||||
/// \deprecated Deprecated in 21.0.0. Use arrow::Result version instead.
|
||||
ARROW_DEPRECATED("Deprecated in 21.0.0. Use arrow::Result version instead.")
|
||||
::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
|
||||
const std::vector<int>& column_indices,
|
||||
std::shared_ptr<::arrow::RecordBatchReader>* out);
|
||||
|
||||
/// \deprecated Deprecated in 21.0.0. Use arrow::Result version instead.
|
||||
ARROW_DEPRECATED("Deprecated in 21.0.0. Use arrow::Result version instead.")
|
||||
::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
|
||||
std::shared_ptr<::arrow::RecordBatchReader>* out);
|
||||
|
||||
/// \deprecated Deprecated in 21.0.0. Use arrow::Result version instead.
|
||||
ARROW_DEPRECATED("Deprecated in 21.0.0. Use arrow::Result version instead.")
|
||||
::arrow::Status GetRecordBatchReader(std::shared_ptr<::arrow::RecordBatchReader>* out);
|
||||
|
||||
/// \brief Return a generator of record batches.
|
||||
///
|
||||
/// The FileReader must outlive the generator, so this requires that you pass in a
|
||||
/// shared_ptr.
|
||||
///
|
||||
/// \returns error Result if either row_group_indices or column_indices contains an
|
||||
/// invalid index
|
||||
virtual ::arrow::Result<
|
||||
std::function<::arrow::Future<std::shared_ptr<::arrow::RecordBatch>>()>>
|
||||
GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
|
||||
const std::vector<int> row_group_indices,
|
||||
const std::vector<int> column_indices,
|
||||
::arrow::internal::Executor* cpu_executor = NULLPTR,
|
||||
int64_t rows_to_readahead = 0) = 0;
|
||||
|
||||
/// Read all columns into a Table
|
||||
virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
|
||||
|
||||
/// \brief Read the given columns into a Table
|
||||
///
|
||||
/// The indicated column indices are relative to the internal representation
|
||||
/// of the parquet table. For instance :
|
||||
/// 0 foo.bar
|
||||
/// foo.bar.baz 0
|
||||
/// foo.bar.baz2 1
|
||||
/// foo.qux 2
|
||||
/// 1 foo2 3
|
||||
/// 2 foo3 4
|
||||
///
|
||||
/// i=0 will read foo.bar.baz, i=1 will read only foo.bar.baz2 and so on.
|
||||
/// Only leaf fields have indices; foo itself doesn't have an index.
|
||||
/// To get the index for a particular leaf field, one can use
|
||||
/// manifest().schema_fields to get the top level fields, and then walk the
|
||||
/// tree to identify the relevant leaf fields and access its column_index.
|
||||
/// To get the total number of leaf fields, use FileMetadata.num_columns().
|
||||
virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
|
||||
std::shared_ptr<::arrow::Table>* out) = 0;
|
||||
|
||||
virtual ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
|
||||
std::shared_ptr<::arrow::Table>* out) = 0;
|
||||
|
||||
virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;
|
||||
|
||||
virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
|
||||
const std::vector<int>& column_indices,
|
||||
std::shared_ptr<::arrow::Table>* out) = 0;
|
||||
|
||||
virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
|
||||
std::shared_ptr<::arrow::Table>* out) = 0;
|
||||
|
||||
/// \brief Scan file contents with one thread, return number of rows
|
||||
virtual ::arrow::Status ScanContents(std::vector<int> columns,
|
||||
const int32_t column_batch_size,
|
||||
int64_t* num_rows) = 0;
|
||||
|
||||
/// \brief Return a reader for the RowGroup, this object must not outlive the
|
||||
/// FileReader.
|
||||
virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;
|
||||
|
||||
/// \brief The number of row groups in the file
|
||||
virtual int num_row_groups() const = 0;
|
||||
|
||||
virtual ParquetFileReader* parquet_reader() const = 0;
|
||||
|
||||
/// Set whether to use multiple threads during reads of multiple columns.
|
||||
/// By default only one thread is used.
|
||||
virtual void set_use_threads(bool use_threads) = 0;
|
||||
|
||||
/// Set number of records to read per batch for the RecordBatchReader.
|
||||
virtual void set_batch_size(int64_t batch_size) = 0;
|
||||
|
||||
virtual const ArrowReaderProperties& properties() const = 0;
|
||||
|
||||
virtual const SchemaManifest& manifest() const = 0;
|
||||
|
||||
virtual ~FileReader() = default;
|
||||
};
|
||||
|
||||
class RowGroupReader {
|
||||
public:
|
||||
virtual ~RowGroupReader() = default;
|
||||
virtual std::shared_ptr<ColumnChunkReader> Column(int column_index) = 0;
|
||||
virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
|
||||
std::shared_ptr<::arrow::Table>* out) = 0;
|
||||
virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
|
||||
|
||||
private:
|
||||
struct Iterator;
|
||||
};
|
||||
|
||||
class ColumnChunkReader {
|
||||
public:
|
||||
virtual ~ColumnChunkReader() = default;
|
||||
virtual ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
|
||||
};
|
||||
|
||||
// At this point, the column reader is a stream iterator. It only knows how to
|
||||
// read the next batch of values for a particular column from the file until it
|
||||
// runs out.
|
||||
//
|
||||
// We also do not expose any internal Parquet details, such as row groups. This
|
||||
// might change in the future.
|
||||
class PARQUET_EXPORT ColumnReader {
|
||||
public:
|
||||
virtual ~ColumnReader() = default;
|
||||
|
||||
// Scan the next array of the indicated size. The actual size of the
|
||||
// returned array may be less than the passed size depending how much data is
|
||||
// available in the file.
|
||||
//
|
||||
// When all the data in the file has been exhausted, the result is set to
|
||||
// nullptr.
|
||||
//
|
||||
// Returns Status::OK on a successful read, including if you have exhausted
|
||||
// the data available in the file.
|
||||
virtual ::arrow::Status NextBatch(int64_t batch_size,
|
||||
std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
|
||||
};
|
||||
|
||||
/// \brief Experimental helper class for bindings (like Python) that struggle
|
||||
/// either with std::move or C++ exceptions
|
||||
class PARQUET_EXPORT FileReaderBuilder {
|
||||
public:
|
||||
FileReaderBuilder();
|
||||
|
||||
/// Create FileReaderBuilder from Arrow file and optional properties / metadata
|
||||
::arrow::Status Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
|
||||
const ReaderProperties& properties = default_reader_properties(),
|
||||
std::shared_ptr<FileMetaData> metadata = NULLPTR);
|
||||
|
||||
/// Create FileReaderBuilder from file path and optional properties / metadata
|
||||
::arrow::Status OpenFile(const std::string& path, bool memory_map = false,
|
||||
const ReaderProperties& props = default_reader_properties(),
|
||||
std::shared_ptr<FileMetaData> metadata = NULLPTR);
|
||||
|
||||
ParquetFileReader* raw_reader() { return raw_reader_.get(); }
|
||||
|
||||
/// Set Arrow MemoryPool for memory allocation
|
||||
FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
|
||||
/// Set Arrow reader properties
|
||||
FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
|
||||
/// Build FileReader instance
|
||||
::arrow::Status Build(std::unique_ptr<FileReader>* out);
|
||||
::arrow::Result<std::unique_ptr<FileReader>> Build();
|
||||
|
||||
private:
|
||||
::arrow::MemoryPool* pool_;
|
||||
ArrowReaderProperties properties_;
|
||||
std::unique_ptr<ParquetFileReader> raw_reader_;
|
||||
};
|
||||
|
||||
/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Build FileReader from Arrow file and MemoryPool
|
||||
///
|
||||
/// Advanced settings are supported through the FileReaderBuilder class.
|
||||
PARQUET_EXPORT
|
||||
::arrow::Result<std::unique_ptr<FileReader>> OpenFile(
|
||||
std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* allocator);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Expose the contents of a Parquet Statistics object as Arrow scalars.
///
/// NOTE(review): judging by the parameter names, `min` and `max` receive the
/// minimum and maximum values - confirm supported types and error conditions
/// against the implementation.
///
/// \param[in] statistics the statistics object to read from
/// \param[out] min receives the scalar for the minimum
/// \param[out] max receives the scalar for the maximum
PARQUET_EXPORT
::arrow::Status StatisticsAsScalars(const Statistics& statistics,
                                    std::shared_ptr<::arrow::Scalar>* min,
                                    std::shared_ptr<::arrow::Scalar>* max);
|
||||
|
||||
namespace internal {
|
||||
|
||||
PARQUET_EXPORT
|
||||
::arrow::Status FuzzReader(const uint8_t* data, int64_t size);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace arrow
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,184 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
|
||||
#include "parquet/level_conversion.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/schema.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class ArrowReaderProperties;
|
||||
class ArrowWriterProperties;
|
||||
class WriterProperties;
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
|
||||
/// schema into a Parquet schema.
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Convert a single Arrow field into a Parquet schema node.
///
/// \param[in] field the Arrow field to convert
/// \param[in] properties Parquet writer properties
/// \param[in] arrow_properties Arrow-specific writer properties
/// \param[out] out receives the resulting schema node
PARQUET_EXPORT
::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
                            const WriterProperties& properties,
                            const ArrowWriterProperties& arrow_properties,
                            schema::NodePtr* out);

/// \brief Convert an Arrow schema into a Parquet SchemaDescriptor.
///
/// \param[in] arrow_schema the Arrow schema to convert
/// \param[in] properties Parquet writer properties
/// \param[in] arrow_properties Arrow-specific writer properties
/// \param[out] out receives the resulting schema descriptor
PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
                                const WriterProperties& properties,
                                const ArrowWriterProperties& arrow_properties,
                                std::shared_ptr<SchemaDescriptor>* out);

/// \brief Convert an Arrow schema into a Parquet SchemaDescriptor (overload
/// without ArrowWriterProperties).
PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
                                const WriterProperties& properties,
                                std::shared_ptr<SchemaDescriptor>* out);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
|
||||
/// schema into an Arrow schema.
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// \brief Convert a Parquet schema into an Arrow schema.
///
/// \param[in] parquet_schema the Parquet schema descriptor to convert
/// \param[in] properties Arrow reader properties
/// \param[in] key_value_metadata key-value metadata supplied by the caller
///     (NOTE(review): presumably the file-level metadata - confirm)
/// \param[out] out receives the resulting Arrow schema
PARQUET_EXPORT
::arrow::Status FromParquetSchema(
    const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
    const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
    std::shared_ptr<::arrow::Schema>* out);

/// \brief Convert a Parquet schema into an Arrow schema (overload without
/// key-value metadata).
PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
                                  const ArrowReaderProperties& properties,
                                  std::shared_ptr<::arrow::Schema>* out);

/// \brief Convert a Parquet schema into an Arrow schema (overload without
/// reader properties or key-value metadata).
PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
                                  std::shared_ptr<::arrow::Schema>* out);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Bridge between an arrow::Field and parquet column indices.
|
||||
struct PARQUET_EXPORT SchemaField {
|
||||
std::shared_ptr<::arrow::Field> field;
|
||||
std::vector<SchemaField> children;
|
||||
|
||||
// Only set for leaf nodes
|
||||
int column_index = -1;
|
||||
|
||||
parquet::internal::LevelInfo level_info;
|
||||
|
||||
bool is_leaf() const { return column_index != -1; }
|
||||
};
|
||||
|
||||
/// \brief Bridge between a parquet Schema and an arrow Schema.
|
||||
///
|
||||
/// Expose parquet columns as a tree structure. Useful traverse and link
|
||||
/// between arrow's Schema and parquet's Schema.
|
||||
struct PARQUET_EXPORT SchemaManifest {
|
||||
static ::arrow::Status Make(
|
||||
const SchemaDescriptor* schema,
|
||||
const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
|
||||
const ArrowReaderProperties& properties, SchemaManifest* manifest);
|
||||
|
||||
const SchemaDescriptor* descr;
|
||||
std::shared_ptr<::arrow::Schema> origin_schema;
|
||||
std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
|
||||
std::vector<SchemaField> schema_fields;
|
||||
|
||||
std::unordered_map<int, const SchemaField*> column_index_to_field;
|
||||
std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
|
||||
|
||||
::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
|
||||
auto it = column_index_to_field.find(column_index);
|
||||
if (it == column_index_to_field.end()) {
|
||||
return ::arrow::Status::KeyError("Column index ", column_index,
|
||||
" not found in schema manifest, may be malformed");
|
||||
}
|
||||
*out = it->second;
|
||||
return ::arrow::Status::OK();
|
||||
}
|
||||
|
||||
const SchemaField* GetParent(const SchemaField* field) const {
|
||||
// Returns nullptr also if not found
|
||||
auto it = child_to_parent.find(field);
|
||||
if (it == child_to_parent.end()) {
|
||||
return NULLPTR;
|
||||
}
|
||||
return it->second;
|
||||
}
|
||||
|
||||
/// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
|
||||
/// correspond to the column root (first node below the parquet schema's root group) of
|
||||
/// each leaf referenced in column_indices.
|
||||
///
|
||||
/// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
|
||||
/// the roots are `a` and `i` (return=[0,2]).
|
||||
///
|
||||
/// root
|
||||
/// -- a <------
|
||||
/// -- -- b | |
|
||||
/// -- -- -- c |
|
||||
/// -- -- -- d |
|
||||
/// -- -- -- -- e
|
||||
/// -- f
|
||||
/// -- -- g
|
||||
/// -- -- -- h
|
||||
/// -- i <---
|
||||
/// -- -- j |
|
||||
/// -- -- -- k
|
||||
::arrow::Result<std::vector<int>> GetFieldIndices(
|
||||
const std::vector<int>& column_indices) const {
|
||||
const schema::GroupNode* group = descr->group_node();
|
||||
std::unordered_set<int> already_added;
|
||||
|
||||
std::vector<int> out;
|
||||
for (int column_idx : column_indices) {
|
||||
if (column_idx < 0 || column_idx >= descr->num_columns()) {
|
||||
return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
|
||||
}
|
||||
|
||||
auto field_node = descr->GetColumnRoot(column_idx);
|
||||
auto field_idx = group->FieldIndex(*field_node);
|
||||
if (field_idx == -1) {
|
||||
return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
|
||||
}
|
||||
|
||||
if (already_added.insert(field_idx).second) {
|
||||
out.push_back(field_idx);
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace arrow
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,487 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array.h"
|
||||
#include "arrow/array/builder_binary.h"
|
||||
#include "arrow/array/builder_decimal.h"
|
||||
#include "arrow/array/builder_primitive.h"
|
||||
#include "arrow/testing/gtest_util.h"
|
||||
#include "arrow/testing/random.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/type_traits.h"
|
||||
#include "arrow/util/decimal.h"
|
||||
#include "arrow/util/float16.h"
|
||||
#include "parquet/column_reader.h"
|
||||
#include "parquet/test_util.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
using internal::RecordReader;
|
||||
|
||||
namespace arrow {
|
||||
|
||||
using ::arrow::Array;
|
||||
using ::arrow::ChunkedArray;
|
||||
using ::arrow::Status;
|
||||
|
||||
/// \brief Compile-time description of a decimal type with a fixed precision.
///
/// The scale is always PRECISION - 1. The precision is validated at compile
/// time against T::kMinPrecision and T::kMaxPrecision.
template <typename T, int32_t PRECISION, typename = ::arrow::enable_if_decimal<T>>
struct DecimalWithPrecisionAndScale {
  using type = T;
  static_assert(PRECISION >= T::kMinPrecision && PRECISION <= T::kMaxPrecision,
                "Invalid precision value");
  static constexpr ::arrow::Type::type type_id = T::type_id;
  static constexpr int32_t precision = PRECISION;
  static constexpr int32_t scale = PRECISION - 1;
};
|
||||
template <int32_t PRECISION>
|
||||
using Decimal32WithPrecisionAndScale =
|
||||
DecimalWithPrecisionAndScale<::arrow::Decimal32Type, PRECISION>;
|
||||
template <int32_t PRECISION>
|
||||
using Decimal64WithPrecisionAndScale =
|
||||
DecimalWithPrecisionAndScale<::arrow::Decimal64Type, PRECISION>;
|
||||
template <int32_t PRECISION>
|
||||
using Decimal128WithPrecisionAndScale =
|
||||
DecimalWithPrecisionAndScale<::arrow::Decimal128Type, PRECISION>;
|
||||
template <int32_t PRECISION>
|
||||
using Decimal256WithPrecisionAndScale =
|
||||
DecimalWithPrecisionAndScale<::arrow::Decimal256Type, PRECISION>;
|
||||
|
||||
template <class ArrowType>
|
||||
::arrow::enable_if_floating_point<ArrowType, Status> NonNullArray(
|
||||
size_t size, std::shared_ptr<Array>* out) {
|
||||
using c_type = typename ArrowType::c_type;
|
||||
std::vector<c_type> values;
|
||||
if constexpr (::arrow::is_half_float_type<ArrowType>::value) {
|
||||
values.resize(size);
|
||||
test::random_float16_numbers(static_cast<int>(size), 0, ::arrow::util::Float16(0.0f),
|
||||
::arrow::util::Float16(1.0f), values.data());
|
||||
} else {
|
||||
::arrow::random_real(size, 0, static_cast<c_type>(0), static_cast<c_type>(1),
|
||||
&values);
|
||||
}
|
||||
::arrow::NumericBuilder<ArrowType> builder;
|
||||
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
template <class ArrowType>
|
||||
::arrow::enable_if_integer<ArrowType, Status> NonNullArray(size_t size,
|
||||
std::shared_ptr<Array>* out) {
|
||||
std::vector<typename ArrowType::c_type> values;
|
||||
::arrow::randint(size, 0, 64, &values);
|
||||
|
||||
// Passing data type so this will work with TimestampType too
|
||||
::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
|
||||
::arrow::default_memory_pool());
|
||||
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
template <class ArrowType>
|
||||
::arrow::enable_if_date<ArrowType, Status> NonNullArray(size_t size,
|
||||
std::shared_ptr<Array>* out) {
|
||||
std::vector<typename ArrowType::c_type> values;
|
||||
::arrow::randint(size, 0, 24, &values);
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
values[i] *= 86400000;
|
||||
}
|
||||
|
||||
// Passing data type so this will work with TimestampType too
|
||||
::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
|
||||
::arrow::default_memory_pool());
|
||||
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
template <class ArrowType>
|
||||
::arrow::enable_if_base_binary<ArrowType, Status> NonNullArray(
|
||||
size_t size, std::shared_ptr<Array>* out) {
|
||||
using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
|
||||
BuilderType builder;
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
RETURN_NOT_OK(builder.Append("test-string"));
|
||||
}
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
template <typename ArrowType>
|
||||
::arrow::enable_if_fixed_size_binary<ArrowType, Status> NonNullArray(
|
||||
size_t size, std::shared_ptr<Array>* out) {
|
||||
using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
|
||||
// set byte_width to the length of "fixed": 5
|
||||
// todo: find a way to generate test data with more diversity.
|
||||
BuilderType builder(::arrow::fixed_size_binary(5));
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
RETURN_NOT_OK(builder.Append("fixed"));
|
||||
}
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
template <int32_t byte_width>
|
||||
static void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) {
|
||||
auto gen = ::arrow::random::RandomArrayGenerator(seed);
|
||||
std::shared_ptr<Array> decimals;
|
||||
if constexpr (byte_width == 4) {
|
||||
decimals = gen.Decimal32(::arrow::decimal32(precision, 0), n);
|
||||
} else if constexpr (byte_width == 8) {
|
||||
decimals = gen.Decimal64(::arrow::decimal64(precision, 0), n);
|
||||
} else if constexpr (byte_width == 16) {
|
||||
decimals = gen.Decimal128(::arrow::decimal128(precision, 0), n);
|
||||
} else {
|
||||
decimals = gen.Decimal256(::arrow::decimal256(precision, 0), n);
|
||||
}
|
||||
std::memcpy(out, decimals->data()->GetValues<uint8_t>(1, 0), byte_width * n);
|
||||
}
|
||||
|
||||
template <typename ArrowType, int32_t precision = ArrowType::precision>
|
||||
::arrow::enable_if_t<std::is_same_v<ArrowType, DecimalWithPrecisionAndScale<
|
||||
typename ArrowType::type, precision>>,
|
||||
Status>
|
||||
NonNullArray(size_t size, std::shared_ptr<Array>* out) {
|
||||
constexpr int32_t kDecimalPrecision = precision;
|
||||
constexpr int32_t kDecimalScale = ArrowType::scale;
|
||||
|
||||
const auto type =
|
||||
std::make_shared<typename ArrowType::type>(kDecimalPrecision, kDecimalScale);
|
||||
const int32_t byte_width = type->byte_width();
|
||||
|
||||
constexpr int32_t seed = 0;
|
||||
|
||||
ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
|
||||
random_decimals<ArrowType::type::kByteWidth>(size, seed, kDecimalPrecision,
|
||||
out_buf->mutable_data());
|
||||
|
||||
using Builder = typename ::arrow::TypeTraits<typename ArrowType::type>::BuilderType;
|
||||
Builder builder(type);
|
||||
RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size));
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
template <class ArrowType>
|
||||
::arrow::enable_if_boolean<ArrowType, Status> NonNullArray(size_t size,
|
||||
std::shared_ptr<Array>* out) {
|
||||
std::vector<uint8_t> values;
|
||||
::arrow::randint(size, 0, 1, &values);
|
||||
::arrow::BooleanBuilder builder;
|
||||
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
// This helper function only supports (size/2) nulls.
|
||||
template <typename ArrowType>
|
||||
::arrow::enable_if_floating_point<ArrowType, Status> NullableArray(
|
||||
size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
|
||||
using c_type = typename ArrowType::c_type;
|
||||
std::vector<c_type> values;
|
||||
if constexpr (::arrow::is_half_float_type<ArrowType>::value) {
|
||||
values.resize(size);
|
||||
test::random_float16_numbers(static_cast<int>(size), 0, ::arrow::util::Float16(-1e4f),
|
||||
::arrow::util::Float16(1e4f), values.data());
|
||||
} else {
|
||||
::arrow::random_real(size, seed, static_cast<c_type>(-1e10),
|
||||
static_cast<c_type>(1e10), &values);
|
||||
}
|
||||
std::vector<uint8_t> valid_bytes(size, 1);
|
||||
|
||||
for (size_t i = 0; i < num_nulls; i++) {
|
||||
valid_bytes[i * 2] = 0;
|
||||
}
|
||||
|
||||
::arrow::NumericBuilder<ArrowType> builder;
|
||||
if (values.size() > 0) {
|
||||
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
|
||||
}
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
// This helper function only supports (size/2) nulls.
|
||||
template <typename ArrowType>
|
||||
::arrow::enable_if_integer<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
|
||||
uint32_t seed,
|
||||
std::shared_ptr<Array>* out) {
|
||||
std::vector<typename ArrowType::c_type> values;
|
||||
|
||||
// Seed is random in Arrow right now
|
||||
(void)seed;
|
||||
::arrow::randint(size, 0, 64, &values);
|
||||
std::vector<uint8_t> valid_bytes(size, 1);
|
||||
|
||||
for (size_t i = 0; i < num_nulls; i++) {
|
||||
valid_bytes[i * 2] = 0;
|
||||
}
|
||||
|
||||
// Passing data type so this will work with TimestampType too
|
||||
::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
|
||||
::arrow::default_memory_pool());
|
||||
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
template <typename ArrowType>
|
||||
::arrow::enable_if_date<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
|
||||
uint32_t seed,
|
||||
std::shared_ptr<Array>* out) {
|
||||
std::vector<typename ArrowType::c_type> values;
|
||||
|
||||
// Seed is random in Arrow right now
|
||||
(void)seed;
|
||||
::arrow::randint(size, 0, 24, &values);
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
values[i] *= 86400000;
|
||||
}
|
||||
std::vector<uint8_t> valid_bytes(size, 1);
|
||||
|
||||
for (size_t i = 0; i < num_nulls; i++) {
|
||||
valid_bytes[i * 2] = 0;
|
||||
}
|
||||
|
||||
// Passing data type so this will work with TimestampType too
|
||||
::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
|
||||
::arrow::default_memory_pool());
|
||||
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
// This helper function only supports (size/2) nulls yet.
|
||||
template <typename ArrowType>
|
||||
::arrow::enable_if_base_binary<ArrowType, Status> NullableArray(
|
||||
size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
|
||||
std::vector<uint8_t> valid_bytes(size, 1);
|
||||
|
||||
for (size_t i = 0; i < num_nulls; i++) {
|
||||
valid_bytes[i * 2] = 0;
|
||||
}
|
||||
|
||||
using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
|
||||
BuilderType builder;
|
||||
|
||||
const int kBufferSize = 10;
|
||||
uint8_t buffer[kBufferSize];
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
if (!valid_bytes[i]) {
|
||||
RETURN_NOT_OK(builder.AppendNull());
|
||||
} else {
|
||||
::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
|
||||
if (ArrowType::is_utf8) {
|
||||
// Trivially force data to be valid UTF8 by making it all ASCII
|
||||
for (auto& byte : buffer) {
|
||||
byte &= 0x7f;
|
||||
}
|
||||
}
|
||||
RETURN_NOT_OK(builder.Append(buffer, kBufferSize));
|
||||
}
|
||||
}
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
// This helper function only supports (size/2) nulls yet,
|
||||
// same as NullableArray<String|Binary>(..)
|
||||
template <typename ArrowType>
|
||||
::arrow::enable_if_fixed_size_binary<ArrowType, Status> NullableArray(
|
||||
size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
|
||||
std::vector<uint8_t> valid_bytes(size, 1);
|
||||
|
||||
for (size_t i = 0; i < num_nulls; i++) {
|
||||
valid_bytes[i * 2] = 0;
|
||||
}
|
||||
|
||||
using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
|
||||
const int byte_width = 10;
|
||||
BuilderType builder(::arrow::fixed_size_binary(byte_width));
|
||||
|
||||
const int kBufferSize = byte_width;
|
||||
uint8_t buffer[kBufferSize];
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
if (!valid_bytes[i]) {
|
||||
RETURN_NOT_OK(builder.AppendNull());
|
||||
} else {
|
||||
::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
|
||||
RETURN_NOT_OK(builder.Append(buffer));
|
||||
}
|
||||
}
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
template <typename ArrowType, int32_t precision = ArrowType::precision>
|
||||
::arrow::enable_if_t<std::is_same_v<ArrowType, DecimalWithPrecisionAndScale<
|
||||
typename ArrowType::type, precision>>,
|
||||
Status>
|
||||
NullableArray(size_t size, size_t num_nulls, uint32_t seed,
|
||||
std::shared_ptr<::arrow::Array>* out) {
|
||||
std::vector<uint8_t> valid_bytes(size, '\1');
|
||||
|
||||
for (size_t i = 0; i < num_nulls; ++i) {
|
||||
valid_bytes[i * 2] = '\0';
|
||||
}
|
||||
|
||||
constexpr int32_t kDecimalPrecision = precision;
|
||||
constexpr int32_t kDecimalScale = ArrowType::scale;
|
||||
|
||||
const auto type =
|
||||
std::make_shared<typename ArrowType::type>(kDecimalPrecision, kDecimalScale);
|
||||
const int32_t byte_width = type->byte_width();
|
||||
|
||||
ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
|
||||
random_decimals<ArrowType::type::kByteWidth>(size, seed, precision,
|
||||
out_buf->mutable_data());
|
||||
|
||||
using Builder = typename ::arrow::TypeTraits<typename ArrowType::type>::BuilderType;
|
||||
Builder builder(type);
|
||||
RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
// This helper function only supports (size/2) nulls yet.
|
||||
template <class ArrowType>
|
||||
::arrow::enable_if_boolean<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
|
||||
uint32_t seed,
|
||||
std::shared_ptr<Array>* out) {
|
||||
std::vector<uint8_t> values;
|
||||
|
||||
// Seed is random in Arrow right now
|
||||
(void)seed;
|
||||
|
||||
::arrow::randint(size, 0, 1, &values);
|
||||
std::vector<uint8_t> valid_bytes(size, 1);
|
||||
|
||||
for (size_t i = 0; i < num_nulls; i++) {
|
||||
valid_bytes[i * 2] = 0;
|
||||
}
|
||||
|
||||
::arrow::BooleanBuilder builder;
|
||||
RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
|
||||
return builder.Finish(out);
|
||||
}
|
||||
|
||||
/// Wrap an Array into a ListArray by splitting it up into size lists.
|
||||
///
|
||||
/// This helper function only supports (size/2) nulls.
|
||||
Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size,
|
||||
int64_t null_count, const std::string& item_name,
|
||||
bool nullable_values, std::shared_ptr<::arrow::ListArray>* out) {
|
||||
// We always include an empty list
|
||||
int64_t non_null_entries = size - null_count - 1;
|
||||
int64_t length_per_entry = values->length() / non_null_entries;
|
||||
|
||||
auto offsets = AllocateBuffer();
|
||||
RETURN_NOT_OK(offsets->Resize((size + 1) * sizeof(int32_t)));
|
||||
int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data());
|
||||
|
||||
auto null_bitmap = AllocateBuffer();
|
||||
int64_t bitmap_size = ::arrow::bit_util::BytesForBits(size);
|
||||
RETURN_NOT_OK(null_bitmap->Resize(bitmap_size));
|
||||
uint8_t* null_bitmap_ptr = null_bitmap->mutable_data();
|
||||
memset(null_bitmap_ptr, 0, bitmap_size);
|
||||
|
||||
int32_t current_offset = 0;
|
||||
for (int64_t i = 0; i < size; i++) {
|
||||
offsets_ptr[i] = current_offset;
|
||||
if (!(((i % 2) == 0) && ((i / 2) < null_count))) {
|
||||
// Non-null list (list with index 1 is always empty).
|
||||
::arrow::bit_util::SetBit(null_bitmap_ptr, i);
|
||||
if (i != 1) {
|
||||
current_offset += static_cast<int32_t>(length_per_entry);
|
||||
}
|
||||
}
|
||||
}
|
||||
offsets_ptr[size] = static_cast<int32_t>(values->length());
|
||||
|
||||
auto value_field = ::arrow::field(item_name, values->type(), nullable_values);
|
||||
*out = std::make_shared<::arrow::ListArray>(::arrow::list(value_field), size, offsets,
|
||||
values, null_bitmap, null_count);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Make an array containing only empty lists, with a null values array
|
||||
Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
|
||||
// Allocate an offsets buffer containing only zeroes
|
||||
const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t);
|
||||
ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, ::arrow::AllocateBuffer(offsets_nbytes));
|
||||
memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);
|
||||
|
||||
auto value_field =
|
||||
::arrow::field("item", ::arrow::float64(), false /* nullable_values */);
|
||||
auto list_type = ::arrow::list(value_field);
|
||||
|
||||
std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap */,
|
||||
nullptr /* values */};
|
||||
auto child_data =
|
||||
::arrow::ArrayData::Make(value_field->type(), 0, std::move(child_buffers));
|
||||
|
||||
std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
|
||||
std::move(offsets_buffer)};
|
||||
auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers));
|
||||
array_data->child_data.push_back(child_data);
|
||||
|
||||
*out_array = ::arrow::MakeArray(array_data);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
std::shared_ptr<::arrow::Table> MakeSimpleTable(
|
||||
const std::shared_ptr<ChunkedArray>& values, bool nullable) {
|
||||
auto schema = ::arrow::schema({::arrow::field("col", values->type(), nullable)});
|
||||
return ::arrow::Table::Make(schema, {values});
|
||||
}
|
||||
|
||||
std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr<Array>& values,
|
||||
bool nullable) {
|
||||
auto carr = std::make_shared<::arrow::ChunkedArray>(values);
|
||||
return MakeSimpleTable(carr, nullable);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ExpectArray(T* expected, Array* result) {
|
||||
auto p_array = static_cast<::arrow::PrimitiveArray*>(result);
|
||||
for (int i = 0; i < result->length(); i++) {
|
||||
EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->values()->data())[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ArrowType>
|
||||
void ExpectArrayT(void* expected, Array* result) {
|
||||
::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result);
|
||||
for (int64_t i = 0; i < result->length(); i++) {
|
||||
EXPECT_EQ(reinterpret_cast<typename ArrowType::c_type*>(expected)[i],
|
||||
reinterpret_cast<const typename ArrowType::c_type*>(
|
||||
p_array->values()->data())[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) {
|
||||
::arrow::BooleanBuilder builder;
|
||||
ARROW_EXPECT_OK(
|
||||
builder.AppendValues(reinterpret_cast<uint8_t*>(expected), result->length()));
|
||||
|
||||
std::shared_ptr<Array> expected_array;
|
||||
ARROW_EXPECT_OK(builder.Finish(&expected_array));
|
||||
EXPECT_TRUE(result->Equals(*expected_array));
|
||||
}
|
||||
|
||||
} // namespace arrow
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,176 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/properties.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
class ChunkedArray;
|
||||
class RecordBatch;
|
||||
class Schema;
|
||||
class Table;
|
||||
|
||||
} // namespace arrow
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class FileMetaData;
|
||||
class ParquetFileWriter;
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \brief Iterative FileWriter class
|
||||
///
|
||||
/// For basic usage, can write a Table at a time, creating one or more row
|
||||
/// groups per write call.
|
||||
///
|
||||
/// For advanced usage, can write column-by-column: Start a new RowGroup or
|
||||
/// Chunk with NewRowGroup, then write column-by-column the whole column chunk.
|
||||
///
|
||||
/// If PARQUET:field_id is present as a metadata key on a field, and the corresponding
|
||||
/// value is a nonnegative integer, then it will be used as the field_id in the parquet
|
||||
/// file.
|
||||
class PARQUET_EXPORT FileWriter {
|
||||
public:
|
||||
static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr<ParquetFileWriter> writer,
|
||||
std::shared_ptr<::arrow::Schema> schema,
|
||||
std::shared_ptr<ArrowWriterProperties> arrow_properties,
|
||||
std::unique_ptr<FileWriter>* out);
|
||||
|
||||
/// \brief Try to create an Arrow to Parquet file writer.
|
||||
///
|
||||
/// \param schema schema of data that will be passed.
|
||||
/// \param pool memory pool to use.
|
||||
/// \param sink output stream to write Parquet data.
|
||||
/// \param properties general Parquet writer properties.
|
||||
/// \param arrow_properties Arrow-specific writer properties.
|
||||
///
|
||||
/// \since 11.0.0
|
||||
static ::arrow::Result<std::unique_ptr<FileWriter>> Open(
|
||||
const ::arrow::Schema& schema, MemoryPool* pool,
|
||||
std::shared_ptr<::arrow::io::OutputStream> sink,
|
||||
std::shared_ptr<WriterProperties> properties = default_writer_properties(),
|
||||
std::shared_ptr<ArrowWriterProperties> arrow_properties =
|
||||
default_arrow_writer_properties());
|
||||
|
||||
/// Return the Arrow schema to be written to.
|
||||
virtual std::shared_ptr<::arrow::Schema> schema() const = 0;
|
||||
|
||||
/// \brief Write a Table to Parquet.
|
||||
///
|
||||
/// \param table Arrow table to write.
|
||||
/// \param chunk_size maximum number of rows to write per row group.
|
||||
virtual ::arrow::Status WriteTable(
|
||||
const ::arrow::Table& table, int64_t chunk_size = DEFAULT_MAX_ROW_GROUP_LENGTH) = 0;
|
||||
|
||||
/// \brief Start a new row group.
|
||||
///
|
||||
/// Returns an error if not all columns have been written.
|
||||
virtual ::arrow::Status NewRowGroup() = 0;
|
||||
|
||||
/// \brief Write ColumnChunk in row group using an array.
|
||||
virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0;
|
||||
|
||||
/// \brief Write ColumnChunk in row group using slice of a ChunkedArray
|
||||
virtual ::arrow::Status WriteColumnChunk(
|
||||
const std::shared_ptr<::arrow::ChunkedArray>& data, int64_t offset,
|
||||
int64_t size) = 0;
|
||||
|
||||
/// \brief Write ColumnChunk in a row group using a ChunkedArray
|
||||
virtual ::arrow::Status WriteColumnChunk(
|
||||
const std::shared_ptr<::arrow::ChunkedArray>& data) = 0;
|
||||
|
||||
/// \brief Start a new buffered row group.
|
||||
///
|
||||
/// Returns an error if not all columns have been written.
|
||||
virtual ::arrow::Status NewBufferedRowGroup() = 0;
|
||||
|
||||
/// \brief Write a RecordBatch into the buffered row group.
|
||||
///
|
||||
/// Multiple RecordBatches can be written into the same row group
|
||||
/// through this method.
|
||||
///
|
||||
/// WriterProperties.max_row_group_length() is respected and a new
|
||||
/// row group will be created if the current row group exceeds the
|
||||
/// limit.
|
||||
///
|
||||
/// Batches get flushed to the output stream once NewBufferedRowGroup()
|
||||
/// or Close() is called.
|
||||
///
|
||||
/// WARNING: If you are writing multiple files in parallel in the same
|
||||
/// executor, deadlock may occur if ArrowWriterProperties::use_threads
|
||||
/// is set to true to write columns in parallel. Please disable use_threads
|
||||
/// option in this case.
|
||||
virtual ::arrow::Status WriteRecordBatch(const ::arrow::RecordBatch& batch) = 0;
|
||||
|
||||
/// \brief Write the footer and close the file.
|
||||
virtual ::arrow::Status Close() = 0;
|
||||
virtual ~FileWriter();
|
||||
|
||||
virtual MemoryPool* memory_pool() const = 0;
|
||||
/// \brief Add key-value metadata to the file.
|
||||
/// \param[in] key_value_metadata the metadata to add.
|
||||
/// \note This will overwrite any existing metadata with the same key.
|
||||
/// \return Error if Close() has been called.
|
||||
///
|
||||
/// WARNING: If `store_schema` is enabled, `ARROW:schema` would be stored
|
||||
/// in the key-value metadata. Overwriting this key would result in
|
||||
/// `store_schema` being unusable during read.
|
||||
virtual ::arrow::Status AddKeyValueMetadata(
|
||||
const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata) = 0;
|
||||
/// \brief Return the file metadata, only available after calling Close().
|
||||
virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
|
||||
};
|
||||
|
||||
/// \brief Write Parquet file metadata only to indicated Arrow OutputStream
|
||||
PARQUET_EXPORT
|
||||
::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
|
||||
::arrow::io::OutputStream* sink);
|
||||
|
||||
/// \brief Write metadata-only Parquet file to indicated Arrow OutputStream
|
||||
PARQUET_EXPORT
|
||||
::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
|
||||
::arrow::io::OutputStream* sink);
|
||||
|
||||
/// \brief Write a Table to Parquet.
|
||||
///
|
||||
/// This writes one table in a single shot. To write a Parquet file with
|
||||
/// multiple tables iteratively, see parquet::arrow::FileWriter.
|
||||
///
|
||||
/// \param table Table to write.
|
||||
/// \param pool memory pool to use.
|
||||
/// \param sink output stream to write Parquet data.
|
||||
/// \param chunk_size maximum number of rows to write per row group.
|
||||
/// \param properties general Parquet writer properties.
|
||||
/// \param arrow_properties Arrow-specific writer properties.
|
||||
::arrow::Status PARQUET_EXPORT
|
||||
WriteTable(const ::arrow::Table& table, MemoryPool* pool,
|
||||
std::shared_ptr<::arrow::io::OutputStream> sink,
|
||||
int64_t chunk_size = DEFAULT_MAX_ROW_GROUP_LENGTH,
|
||||
std::shared_ptr<WriterProperties> properties = default_writer_properties(),
|
||||
std::shared_ptr<ArrowWriterProperties> arrow_properties =
|
||||
default_arrow_writer_properties());
|
||||
|
||||
} // namespace arrow
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,47 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "parquet/types.h"
|
||||
|
||||
namespace parquet::benchmarks {
|
||||
|
||||
template <typename T>
|
||||
void GenerateBenchmarkData(uint32_t size, uint32_t seed, T* data,
|
||||
std::vector<uint8_t>* heap, uint32_t data_string_length);
|
||||
|
||||
#define _GENERATE_BENCHMARK_DATA_DECL(KLASS) \
|
||||
template <> \
|
||||
void GenerateBenchmarkData(uint32_t size, uint32_t seed, KLASS* data, \
|
||||
std::vector<uint8_t>* heap, uint32_t data_string_length);
|
||||
|
||||
_GENERATE_BENCHMARK_DATA_DECL(int32_t)
|
||||
_GENERATE_BENCHMARK_DATA_DECL(int64_t)
|
||||
_GENERATE_BENCHMARK_DATA_DECL(float)
|
||||
_GENERATE_BENCHMARK_DATA_DECL(double)
|
||||
_GENERATE_BENCHMARK_DATA_DECL(ByteArray)
|
||||
_GENERATE_BENCHMARK_DATA_DECL(FLBA)
|
||||
_GENERATE_BENCHMARK_DATA_DECL(Int96)
|
||||
|
||||
#undef _GENERATE_BENCHMARK_DATA_DECL
|
||||
|
||||
} // namespace parquet::benchmarks
|
||||
@@ -0,0 +1,363 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/logging.h"
|
||||
#include "parquet/hasher.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/types.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
// A Bloom filter is a compact structure to indicate whether an item is not in a set or
|
||||
// probably in a set. The Bloom filter usually consists of a bit set that represents a
|
||||
// set of elements, a hash strategy and a Bloom filter algorithm.
|
||||
class PARQUET_EXPORT BloomFilter {
|
||||
public:
|
||||
// Maximum Bloom filter size, it sets to HDFS default block size 128MB
|
||||
// This value will be reconsidered when implementing Bloom filter producer.
|
||||
static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
|
||||
|
||||
/// Determine whether an element exist in set or not.
|
||||
///
|
||||
/// @param hash the element to contain.
|
||||
/// @return false if value is definitely not in set, and true means PROBABLY
|
||||
/// in set.
|
||||
virtual bool FindHash(uint64_t hash) const = 0;
|
||||
|
||||
/// Insert element to set represented by Bloom filter bitset.
|
||||
/// @param hash the hash of value to insert into Bloom filter.
|
||||
virtual void InsertHash(uint64_t hash) = 0;
|
||||
|
||||
/// Insert elements to set represented by Bloom filter bitset.
|
||||
/// @param hashes the hash values to insert into Bloom filter.
|
||||
/// @param num_values the number of hash values to insert.
|
||||
virtual void InsertHashes(const uint64_t* hashes, int num_values) = 0;
|
||||
|
||||
/// Write this Bloom filter to an output stream. A Bloom filter structure should
|
||||
/// include bitset length, hash strategy, algorithm, and bitset.
|
||||
///
|
||||
/// @param sink the output stream to write
|
||||
virtual void WriteTo(ArrowOutputStream* sink) const = 0;
|
||||
|
||||
/// Get the number of bytes of bitset
|
||||
virtual uint32_t GetBitsetSize() const = 0;
|
||||
|
||||
/// Compute hash for 32 bits value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value to hash.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(int32_t value) const = 0;
|
||||
|
||||
/// Compute hash for 64 bits value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value to hash.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(int64_t value) const = 0;
|
||||
|
||||
/// Compute hash for float value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value to hash.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(float value) const = 0;
|
||||
|
||||
/// Compute hash for double value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value to hash.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(double value) const = 0;
|
||||
|
||||
/// Compute hash for Int96 value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value to hash.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(const Int96* value) const = 0;
|
||||
|
||||
/// Compute hash for ByteArray value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value to hash.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(const ByteArray* value) const = 0;
|
||||
|
||||
/// Compute hash for fixed byte array value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value address.
|
||||
/// @param len the value length.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
|
||||
|
||||
/// Batch compute hashes for 32 bits values by using its plain encoding result.
|
||||
///
|
||||
/// @param values values a pointer to the values to hash.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const = 0;
|
||||
|
||||
/// Batch compute hashes for 64 bits values by using its plain encoding result.
|
||||
///
|
||||
/// @param values a pointer to the values to hash.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const = 0;
|
||||
|
||||
/// Batch compute hashes for float values by using its plain encoding result.
|
||||
///
|
||||
/// @param values a pointer to the values to hash.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const float* values, int num_values, uint64_t* hashes) const = 0;
|
||||
|
||||
/// Batch compute hashes for double values by using its plain encoding result.
|
||||
///
|
||||
/// @param values a pointer to the values to hash.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const double* values, int num_values, uint64_t* hashes) const = 0;
|
||||
|
||||
/// Batch compute hashes for Int96 values by using its plain encoding result.
|
||||
///
|
||||
/// @param values a pointer to the values to hash.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const Int96* values, int num_values, uint64_t* hashes) const = 0;
|
||||
|
||||
/// Batch compute hashes for ByteArray values by using its plain encoding result.
|
||||
///
|
||||
/// @param values a pointer to the values to hash.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const ByteArray* values, int num_values,
|
||||
uint64_t* hashes) const = 0;
|
||||
|
||||
/// Batch compute hashes for fixed byte array values by using its plain encoding result.
|
||||
///
|
||||
/// @param values a pointer to the values to hash.
|
||||
/// @param type_len the value length.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const FLBA* values, uint32_t type_len, int num_values,
|
||||
uint64_t* hashes) const = 0;
|
||||
|
||||
virtual ~BloomFilter() = default;
|
||||
|
||||
protected:
|
||||
// Hash strategy available for Bloom filter.
|
||||
enum class HashStrategy : uint32_t { XXHASH = 0 };
|
||||
|
||||
// Bloom filter algorithm.
|
||||
enum class Algorithm : uint32_t { BLOCK = 0 };
|
||||
|
||||
enum class CompressionStrategy : uint32_t { UNCOMPRESSED = 0 };
|
||||
};
|
||||
|
||||
/// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
|
||||
/// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
|
||||
/// hash the item to a tiny Bloom filter whose size fits a single cache line or smaller.
|
||||
///
|
||||
/// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
|
||||
/// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
|
||||
class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
 public:
  /// Construct a BlockSplitBloomFilter. It uses XXH64 as its hash function.
  ///
  /// \param pool memory pool to use.
  explicit BlockSplitBloomFilter(
      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

  /// Initialize the BlockSplitBloomFilter. num_bytes should be within
  /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes]; a value outside that
  /// range is clamped to the nearest bound, and the size is also rounded up to a
  /// power of 2.
  ///
  /// @param num_bytes The number of bytes to store Bloom filter bitset.
  void Init(uint32_t num_bytes);

  /// Initialize the BlockSplitBloomFilter from an existing bitset. The bitset is
  /// copied into the underlying buffer because the given bitset may not satisfy the
  /// 32-byte alignment requirement, which may lead to a segfault when performing
  /// SIMD instructions. It is the caller's responsibility to free the bitset passed
  /// in. This is used when reconstructing a Bloom filter from a parquet file.
  ///
  /// @param bitset The given bitset to initialize the Bloom filter.
  /// @param num_bytes The number of bytes of given bitset.
  void Init(const uint8_t* bitset, uint32_t num_bytes);

  /// Minimum Bloom filter size; it is set to 32 bytes to fit a tiny Bloom filter.
  static constexpr uint32_t kMinimumBloomFilterBytes = 32;

  /// Calculate the optimal size in bytes according to the number of distinct values
  /// and the false positive probability.
  ///
  /// @param ndv The number of distinct values.
  /// @param fpp The false positive probability.
  /// @return OptimalNumOfBits(ndv, fpp) divided by 8, i.e. a value between
  /// kMinimumBloomFilterBytes and kMaximumBloomFilterBytes.
  static uint32_t OptimalNumOfBytes(uint32_t ndv, double fpp) {
    uint32_t optimal_num_of_bits = OptimalNumOfBits(ndv, fpp);
    ARROW_DCHECK(::arrow::bit_util::IsMultipleOf8(optimal_num_of_bits));
    return optimal_num_of_bits >> 3;
  }

  /// Calculate the optimal size in bits according to the number of distinct values
  /// and the false positive probability.
  ///
  /// @param ndv The number of distinct values.
  /// @param fpp The false positive probability; expected to be in (0, 1).
  /// @return a value between kMinimumBloomFilterBytes * 8 and
  /// kMaximumBloomFilterBytes * 8. The value is rounded up to a power of 2 unless
  /// it is capped at the upper bound.
  static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
    ARROW_DCHECK(fpp > 0.0 && fpp < 1.0);
    const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
    uint32_t num_bits;

    // Handle overflow. The condition is phrased as a negated range check so that a
    // NaN (produced when fpp is outside (0, 1) and the DCHECK above is compiled out)
    // also takes the clamping branch instead of reaching the double -> uint32_t
    // conversion, which would be undefined behaviour.
    if (!(m >= 0 && m <= kMaximumBloomFilterBytes << 3)) {
      num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
    } else {
      num_bits = static_cast<uint32_t>(m);
    }

    // Round up to lower bound
    if (num_bits < kMinimumBloomFilterBytes << 3) {
      num_bits = kMinimumBloomFilterBytes << 3;
    }

    // Get next power of 2 if bits is not power of 2.
    if ((num_bits & (num_bits - 1)) != 0) {
      num_bits = static_cast<uint32_t>(::arrow::bit_util::NextPower2(num_bits));
    }

    // Round down to upper bound
    if (num_bits > kMaximumBloomFilterBytes << 3) {
      num_bits = kMaximumBloomFilterBytes << 3;
    }

    return num_bits;
  }

  bool FindHash(uint64_t hash) const override;
  void InsertHash(uint64_t hash) override;
  void InsertHashes(const uint64_t* hashes, int num_values) override;
  void WriteTo(ArrowOutputStream* sink) const override;
  uint32_t GetBitsetSize() const override { return num_bytes_; }

  // Single-value hashing: every overload forwards to the configured hasher.
  uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
  uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
  uint64_t Hash(float value) const override { return hasher_->Hash(value); }
  uint64_t Hash(double value) const override { return hasher_->Hash(value); }
  uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
  uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
  uint64_t Hash(const FLBA* value, uint32_t len) const override {
    return hasher_->Hash(value, len);
  }

  // Batch hashing: every overload forwards to the configured hasher.
  void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const override {
    hasher_->Hashes(values, num_values, hashes);
  }
  void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const override {
    hasher_->Hashes(values, num_values, hashes);
  }
  void Hashes(const float* values, int num_values, uint64_t* hashes) const override {
    hasher_->Hashes(values, num_values, hashes);
  }
  void Hashes(const double* values, int num_values, uint64_t* hashes) const override {
    hasher_->Hashes(values, num_values, hashes);
  }
  void Hashes(const Int96* values, int num_values, uint64_t* hashes) const override {
    hasher_->Hashes(values, num_values, hashes);
  }
  void Hashes(const ByteArray* values, int num_values, uint64_t* hashes) const override {
    hasher_->Hashes(values, num_values, hashes);
  }
  void Hashes(const FLBA* values, uint32_t type_len, int num_values,
              uint64_t* hashes) const override {
    hasher_->Hashes(values, type_len, num_values, hashes);
  }

  // Pointer-taking convenience overloads for the primitive types; they dereference
  // the pointer and hash the pointed-to value.
  uint64_t Hash(const int32_t* value) const { return hasher_->Hash(*value); }
  uint64_t Hash(const int64_t* value) const { return hasher_->Hash(*value); }
  uint64_t Hash(const float* value) const { return hasher_->Hash(*value); }
  uint64_t Hash(const double* value) const { return hasher_->Hash(*value); }

  /// Deserialize the Bloom filter from an input stream. It is used when
  /// reconstructing a Bloom filter from a parquet file.
  ///
  /// @param properties The parquet reader properties.
  /// @param input_stream The input stream from which to construct the bloom filter.
  /// @param bloom_filter_length The length of the serialized bloom filter including
  /// header.
  /// @return The BlockSplitBloomFilter.
  static BlockSplitBloomFilter Deserialize(
      const ReaderProperties& properties, ArrowInputStream* input_stream,
      std::optional<int64_t> bloom_filter_length = std::nullopt);

 private:
  inline void InsertHashImpl(uint64_t hash);

  // Bytes in a tiny Bloom filter block.
  static constexpr int kBytesPerFilterBlock = 32;

  // The number of bits to be set in each tiny Bloom filter
  static constexpr int kBitsSetPerBlock = 8;

  // A mask structure used to set bits in each tiny Bloom filter.
  struct BlockMask {
    uint32_t item[kBitsSetPerBlock];
  };

  // The block-based algorithm needs eight odd SALT values to calculate eight indexes
  // of bit to set, one bit in each 32-bit word.
  static constexpr uint32_t SALT[kBitsSetPerBlock] = {
      0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
      0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};

  // Memory pool to allocate aligned buffer for bitset
  ::arrow::MemoryPool* pool_;

  // The underlying buffer of bitset.
  std::shared_ptr<Buffer> data_;

  // The number of bytes of Bloom filter bitset.
  uint32_t num_bytes_;

  // Hash strategy used in this Bloom filter.
  HashStrategy hash_strategy_;

  // Algorithm used in this Bloom filter.
  Algorithm algorithm_;

  // Compression used in this Bloom filter.
  CompressionStrategy compression_strategy_;

  // The hash pointer points to actual hash class used.
  std::unique_ptr<Hasher> hasher_;
};
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,68 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "parquet/properties.h"
|
||||
#include "parquet/type_fwd.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class InternalFileDecryptor;
|
||||
class BloomFilter;
|
||||
|
||||
class PARQUET_EXPORT RowGroupBloomFilterReader {
|
||||
public:
|
||||
virtual ~RowGroupBloomFilterReader() = default;
|
||||
|
||||
/// \brief Read bloom filter of a column chunk.
|
||||
///
|
||||
/// \param[in] i column ordinal of the column chunk.
|
||||
/// \returns bloom filter of the column or nullptr if it does not exist.
|
||||
/// \throws ParquetException if the index is out of bound, or read bloom
|
||||
/// filter failed.
|
||||
virtual std::unique_ptr<BloomFilter> GetColumnBloomFilter(int i) = 0;
|
||||
};
|
||||
|
||||
/// \brief Interface for reading the bloom filter for a Parquet file.
|
||||
class PARQUET_EXPORT BloomFilterReader {
|
||||
public:
|
||||
virtual ~BloomFilterReader() = default;
|
||||
|
||||
/// \brief Create a BloomFilterReader instance.
|
||||
/// \returns a BloomFilterReader instance.
|
||||
/// WARNING: The returned BloomFilterReader references to all the input parameters, so
|
||||
/// it must not outlive all of the input parameters. Usually these input parameters
|
||||
/// come from the same ParquetFileReader object, so it must not outlive the reader
|
||||
/// that creates this BloomFilterReader.
|
||||
static std::unique_ptr<BloomFilterReader> Make(
|
||||
std::shared_ptr<::arrow::io::RandomAccessFile> input,
|
||||
std::shared_ptr<FileMetaData> file_metadata, const ReaderProperties& properties,
|
||||
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
|
||||
|
||||
/// \brief Get the bloom filter reader of a specific row group.
|
||||
/// \param[in] i row group ordinal to get bloom filter reader.
|
||||
/// \returns RowGroupBloomFilterReader of the specified row group. A nullptr may or may
|
||||
/// not be returned if the bloom filter for the row group is unavailable. It
|
||||
/// is the caller's responsibility to check the return value of follow-up calls
|
||||
/// to the RowGroupBloomFilterReader.
|
||||
/// \throws ParquetException if the index is out of bound.
|
||||
virtual std::shared_ptr<RowGroupBloomFilterReader> RowGroup(int i) = 0;
|
||||
};
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,179 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This module defines an abstract interface for iterating through pages in a
|
||||
// Parquet column chunk within a row group. It could be extended in the future
|
||||
// to iterate through all data pages in all chunks in a file.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
#include "parquet/size_statistics.h"
|
||||
#include "parquet/statistics.h"
|
||||
#include "parquet/types.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
// TODO: Parallel processing is not yet safe because of memory-ownership
|
||||
// semantics (the PageReader may or may not own the memory referenced by a
|
||||
// page)
|
||||
//
|
||||
// TODO(wesm): In the future Parquet implementations may store the crc code
|
||||
// in format::PageHeader. parquet-mr currently does not, so we also skip it
|
||||
// here, both on the read and write path
|
||||
class Page {
|
||||
public:
|
||||
Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
|
||||
: buffer_(buffer), type_(type) {}
|
||||
|
||||
PageType::type type() const { return type_; }
|
||||
|
||||
std::shared_ptr<Buffer> buffer() const { return buffer_; }
|
||||
|
||||
// @returns: a pointer to the page's data
|
||||
const uint8_t* data() const { return buffer_->data(); }
|
||||
|
||||
// @returns: the total size in bytes of the page's data buffer
|
||||
int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
|
||||
|
||||
private:
|
||||
std::shared_ptr<Buffer> buffer_;
|
||||
PageType::type type_;
|
||||
};
|
||||
|
||||
/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
|
||||
class DataPage : public Page {
|
||||
public:
|
||||
int32_t num_values() const { return num_values_; }
|
||||
Encoding::type encoding() const { return encoding_; }
|
||||
int64_t uncompressed_size() const { return uncompressed_size_; }
|
||||
const EncodedStatistics& statistics() const { return statistics_; }
|
||||
/// Return the row ordinal within the row group to the first row in the data page.
|
||||
/// Currently it is only present from data pages created by ColumnWriter in order
|
||||
/// to collect page index.
|
||||
std::optional<int64_t> first_row_index() const { return first_row_index_; }
|
||||
const SizeStatistics& size_statistics() const { return size_statistics_; }
|
||||
|
||||
virtual ~DataPage() = default;
|
||||
|
||||
protected:
|
||||
DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
|
||||
Encoding::type encoding, int64_t uncompressed_size,
|
||||
EncodedStatistics statistics, std::optional<int64_t> first_row_index,
|
||||
SizeStatistics size_statistics)
|
||||
: Page(buffer, type),
|
||||
num_values_(num_values),
|
||||
encoding_(encoding),
|
||||
uncompressed_size_(uncompressed_size),
|
||||
statistics_(std::move(statistics)),
|
||||
first_row_index_(std::move(first_row_index)),
|
||||
size_statistics_(std::move(size_statistics)) {}
|
||||
|
||||
int32_t num_values_;
|
||||
Encoding::type encoding_;
|
||||
int64_t uncompressed_size_;
|
||||
EncodedStatistics statistics_;
|
||||
/// Row ordinal within the row group to the first row in the data page.
|
||||
std::optional<int64_t> first_row_index_;
|
||||
SizeStatistics size_statistics_;
|
||||
};
|
||||
|
||||
class DataPageV1 : public DataPage {
|
||||
public:
|
||||
DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
|
||||
Encoding::type encoding, Encoding::type definition_level_encoding,
|
||||
Encoding::type repetition_level_encoding, int64_t uncompressed_size,
|
||||
EncodedStatistics statistics = EncodedStatistics(),
|
||||
std::optional<int64_t> first_row_index = std::nullopt,
|
||||
SizeStatistics size_statistics = SizeStatistics())
|
||||
: DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
|
||||
std::move(statistics), std::move(first_row_index),
|
||||
std::move(size_statistics)),
|
||||
definition_level_encoding_(definition_level_encoding),
|
||||
repetition_level_encoding_(repetition_level_encoding) {}
|
||||
|
||||
Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
|
||||
|
||||
Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
|
||||
|
||||
private:
|
||||
Encoding::type definition_level_encoding_;
|
||||
Encoding::type repetition_level_encoding_;
|
||||
};
|
||||
|
||||
class DataPageV2 : public DataPage {
|
||||
public:
|
||||
DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
|
||||
int32_t num_rows, Encoding::type encoding,
|
||||
int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
|
||||
int64_t uncompressed_size, bool is_compressed = false,
|
||||
EncodedStatistics statistics = EncodedStatistics(),
|
||||
std::optional<int64_t> first_row_index = std::nullopt,
|
||||
SizeStatistics size_statistics = SizeStatistics())
|
||||
: DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
|
||||
std::move(statistics), std::move(first_row_index),
|
||||
std::move(size_statistics)),
|
||||
num_nulls_(num_nulls),
|
||||
num_rows_(num_rows),
|
||||
definition_levels_byte_length_(definition_levels_byte_length),
|
||||
repetition_levels_byte_length_(repetition_levels_byte_length),
|
||||
is_compressed_(is_compressed) {}
|
||||
|
||||
int32_t num_nulls() const { return num_nulls_; }
|
||||
|
||||
int32_t num_rows() const { return num_rows_; }
|
||||
|
||||
int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
|
||||
|
||||
int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
|
||||
|
||||
bool is_compressed() const { return is_compressed_; }
|
||||
|
||||
private:
|
||||
int32_t num_nulls_;
|
||||
int32_t num_rows_;
|
||||
int32_t definition_levels_byte_length_;
|
||||
int32_t repetition_levels_byte_length_;
|
||||
bool is_compressed_;
|
||||
};
|
||||
|
||||
class DictionaryPage : public Page {
|
||||
public:
|
||||
DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
|
||||
Encoding::type encoding, bool is_sorted = false)
|
||||
: Page(buffer, PageType::DICTIONARY_PAGE),
|
||||
num_values_(num_values),
|
||||
encoding_(encoding),
|
||||
is_sorted_(is_sorted) {}
|
||||
|
||||
int32_t num_values() const { return num_values_; }
|
||||
|
||||
Encoding::type encoding() const { return encoding_; }
|
||||
|
||||
bool is_sorted() const { return is_sorted_; }
|
||||
|
||||
private:
|
||||
int32_t num_values_;
|
||||
Encoding::type encoding_;
|
||||
bool is_sorted_;
|
||||
};
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,458 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "parquet/exception.h"
|
||||
#include "parquet/level_conversion.h"
|
||||
#include "parquet/metadata.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/properties.h"
|
||||
#include "parquet/schema.h"
|
||||
#include "parquet/types.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace bit_util {
|
||||
class BitReader;
|
||||
} // namespace bit_util
|
||||
|
||||
namespace util {
|
||||
template <typename T>
|
||||
class RleBitPackedDecoder;
|
||||
} // namespace util
|
||||
|
||||
} // namespace arrow
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class Decryptor;
|
||||
class Page;
|
||||
|
||||
// 16 MB is the default maximum page header size
|
||||
static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024;
|
||||
|
||||
// 16 KB is the default expected page header size
|
||||
static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024;
|
||||
|
||||
// \brief DataPageStats stores encoded statistics and number of values/rows for
|
||||
// a page.
|
||||
struct PARQUET_EXPORT DataPageStats {
|
||||
DataPageStats(const EncodedStatistics* encoded_statistics, int32_t num_values,
|
||||
std::optional<int32_t> num_rows)
|
||||
: encoded_statistics(encoded_statistics),
|
||||
num_values(num_values),
|
||||
num_rows(num_rows) {}
|
||||
|
||||
// Encoded statistics extracted from the page header.
|
||||
// Nullptr if there are no statistics in the page header.
|
||||
const EncodedStatistics* encoded_statistics;
|
||||
// Number of values stored in the page. Filled for both V1 and V2 data pages.
|
||||
// For repeated fields, this can be greater than number of rows. For
|
||||
// non-repeated fields, this will be the same as the number of rows.
|
||||
int32_t num_values;
|
||||
// Number of rows stored in the page. std::nullopt if not available.
|
||||
std::optional<int32_t> num_rows;
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT LevelDecoder {
|
||||
public:
|
||||
LevelDecoder();
|
||||
~LevelDecoder();
|
||||
|
||||
// Initialize the LevelDecoder state with new data
|
||||
// and return the number of bytes consumed
|
||||
int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values,
|
||||
const uint8_t* data, int32_t data_size);
|
||||
|
||||
void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values,
|
||||
const uint8_t* data);
|
||||
|
||||
// Decodes a batch of levels into an array and returns the number of levels decoded
|
||||
int Decode(int batch_size, int16_t* levels);
|
||||
|
||||
private:
|
||||
int bit_width_;
|
||||
int num_values_remaining_;
|
||||
Encoding::type encoding_;
|
||||
std::unique_ptr<::arrow::util::RleBitPackedDecoder<int16_t>> rle_decoder_;
|
||||
std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_;
|
||||
int16_t max_level_;
|
||||
};
|
||||
|
||||
struct CryptoContext {
|
||||
bool start_decrypt_with_dictionary_page = false;
|
||||
int16_t row_group_ordinal = -1;
|
||||
int16_t column_ordinal = -1;
|
||||
std::function<std::unique_ptr<Decryptor>()> meta_decryptor_factory;
|
||||
std::function<std::unique_ptr<Decryptor>()> data_decryptor_factory;
|
||||
};
|
||||
|
||||
// Abstract page iterator interface. This way, we can feed column pages to the
|
||||
// ColumnReader through whatever mechanism we choose
|
||||
class PARQUET_EXPORT PageReader {
|
||||
using DataPageFilter = std::function<bool(const DataPageStats&)>;
|
||||
|
||||
public:
|
||||
virtual ~PageReader() = default;
|
||||
|
||||
static std::unique_ptr<PageReader> Open(
|
||||
std::shared_ptr<ArrowInputStream> stream, int64_t total_num_values,
|
||||
Compression::type codec, bool always_compressed = false,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
|
||||
const CryptoContext* ctx = NULLPTR);
|
||||
static std::unique_ptr<PageReader> Open(std::shared_ptr<ArrowInputStream> stream,
|
||||
int64_t total_num_values,
|
||||
Compression::type codec,
|
||||
const ReaderProperties& properties,
|
||||
bool always_compressed = false,
|
||||
const CryptoContext* ctx = NULLPTR);
|
||||
|
||||
// If data_page_filter is present (not null), NextPage() will call the
|
||||
// callback function exactly once per page in the order the pages appear in
|
||||
// the column. If the callback function returns true the page will be
|
||||
// skipped. The callback will be called only if the page type is DATA_PAGE or
|
||||
// DATA_PAGE_V2. Dictionary pages will not be skipped.
|
||||
// Caller is responsible for checking that statistics are correct using
|
||||
// ApplicationVersion::HasCorrectStatistics().
|
||||
// \note API EXPERIMENTAL
|
||||
void set_data_page_filter(DataPageFilter data_page_filter) {
|
||||
data_page_filter_ = std::move(data_page_filter);
|
||||
}
|
||||
|
||||
// @returns: shared_ptr<Page>(nullptr) on EOS, std::shared_ptr<Page>
|
||||
// containing new Page otherwise
|
||||
//
|
||||
// The returned Page may contain references that aren't guaranteed to live
|
||||
// beyond the next call to NextPage().
|
||||
virtual std::shared_ptr<Page> NextPage() = 0;
|
||||
|
||||
virtual void set_max_page_header_size(uint32_t size) = 0;
|
||||
|
||||
protected:
|
||||
// Callback that decides if we should skip a page or not.
|
||||
DataPageFilter data_page_filter_;
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT ColumnReader {
|
||||
public:
|
||||
virtual ~ColumnReader() = default;
|
||||
|
||||
static std::shared_ptr<ColumnReader> Make(
|
||||
const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
|
||||
|
||||
// Returns true if there are still values in this column.
|
||||
virtual bool HasNext() = 0;
|
||||
|
||||
virtual Type::type type() const = 0;
|
||||
|
||||
virtual const ColumnDescriptor* descr() const = 0;
|
||||
|
||||
// Get the encoding that can be exposed by this reader. If it returns
|
||||
// dictionary encoding, then ReadBatchWithDictionary can be used to read data.
|
||||
//
|
||||
// \note API EXPERIMENTAL
|
||||
virtual ExposedEncoding GetExposedEncoding() = 0;
|
||||
|
||||
protected:
|
||||
friend class RowGroupReader;
|
||||
// Set the encoding that can be exposed by this reader.
|
||||
//
|
||||
// \note API EXPERIMENTAL
|
||||
virtual void SetExposedEncoding(ExposedEncoding encoding) = 0;
|
||||
};
|
||||
|
||||
// API to read values from a single column. This is a main client facing API.
|
||||
template <typename DType>
|
||||
class TypedColumnReader : public ColumnReader {
|
||||
public:
|
||||
using T = typename DType::c_type;
|
||||
|
||||
// Read a batch of repetition levels, definition levels, and values from the
|
||||
// column.
|
||||
//
|
||||
// Since null values are not stored in the values, the number of values read
|
||||
// may be less than the number of repetition and definition levels. With
|
||||
// nested data this is almost certainly true.
|
||||
//
|
||||
// Set def_levels or rep_levels to nullptr if you want to skip reading them.
|
||||
// This is only safe if you know through some other source that there are no
|
||||
// undefined values.
|
||||
//
|
||||
// To fully exhaust a row group, you must read batches until the number of
|
||||
// values read reaches the number of stored values according to the metadata.
|
||||
//
|
||||
// This API is the same for both V1 and V2 of the DataPage
|
||||
//
|
||||
// @returns: actual number of levels read (see values_read for number of values read)
|
||||
virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
|
||||
T* values, int64_t* values_read) = 0;
|
||||
|
||||
// Skip reading values. This method will work for both repeated and
|
||||
// non-repeated fields. Note that this method is skipping values and not
|
||||
// records. This distinction is important for repeated fields, meaning that
|
||||
// we are not skipping over the values to the next record. For example,
|
||||
// consider the following two consecutive records containing one repeated field:
|
||||
// {[1, 2, 3]}, {[4, 5]}. If we Skip(2), our next read value will be 3, which
|
||||
// is inside the first record.
|
||||
// Returns the number of values skipped.
|
||||
virtual int64_t Skip(int64_t num_values_to_skip) = 0;
|
||||
|
||||
// Read a batch of repetition levels, definition levels, and indices from the
|
||||
// column. And read the dictionary if a dictionary page is encountered during
|
||||
// reading pages. This API is similar to ReadBatch(), with ability to read
|
||||
// dictionary and indices. It is only valid to call this method when the reader can
|
||||
// expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns
|
||||
// DICTIONARY).
|
||||
//
|
||||
// The dictionary is read along with the data page. When there's no data page,
|
||||
// the dictionary won't be returned.
|
||||
//
|
||||
// @param batch_size The batch size to read
|
||||
// @param[out] def_levels The Parquet definition levels.
|
||||
// @param[out] rep_levels The Parquet repetition levels.
|
||||
// @param[out] indices The dictionary indices.
|
||||
// @param[out] indices_read The number of indices read.
|
||||
// @param[out] dict The pointer to dictionary values. It will return nullptr if
|
||||
// there's no data page. Each column chunk only has one dictionary page. The dictionary
|
||||
// is owned by the reader, so the caller is responsible for copying the dictionary
|
||||
// values before the reader gets destroyed.
|
||||
// @param[out] dict_len The dictionary length. It will return 0 if there's no data
|
||||
// page.
|
||||
// @returns: actual number of levels read (see indices_read for number of
|
||||
// indices read
|
||||
//
|
||||
// \note API EXPERIMENTAL
|
||||
virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
|
||||
int16_t* rep_levels, int32_t* indices,
|
||||
int64_t* indices_read, const T** dict,
|
||||
int32_t* dict_len) = 0;
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Stateful column reader that delimits semantic records for both flat
|
||||
/// and nested columns
|
||||
///
|
||||
/// \note API EXPERIMENTAL
|
||||
/// \since 1.3.0
|
||||
class PARQUET_EXPORT RecordReader {
|
||||
public:
|
||||
/// \brief Creates a record reader.
|
||||
/// @param descr Column descriptor
|
||||
/// @param leaf_info Level info, used to determine if a column is nullable or not
|
||||
/// @param pool Memory pool to use for buffering values and rep/def levels
|
||||
/// @param read_dictionary True if reading directly as Arrow dictionary-encoded
|
||||
/// @param read_dense_for_nullable True if reading dense and not leaving space for null
|
||||
/// values
|
||||
/// @param arrow_type Which type to read this column as (optional). Currently
|
||||
/// only used for byte array columns (see BinaryRecordReader::GetBuilderChunks).
|
||||
static std::shared_ptr<RecordReader> Make(
|
||||
const ColumnDescriptor* descr, LevelInfo leaf_info,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
|
||||
bool read_dictionary = false, bool read_dense_for_nullable = false,
|
||||
const std::shared_ptr<::arrow::DataType>& arrow_type = NULLPTR);
|
||||
|
||||
virtual ~RecordReader() = default;
|
||||
|
||||
/// \brief Attempt to read indicated number of records from column chunk
|
||||
/// Note that for repeated fields, a record may have more than one value
|
||||
/// and all of them are read. If read_dense_for_nullable() it will
|
||||
/// not leave any space for null values. Otherwise, it will read spaced.
|
||||
/// \return number of records read
|
||||
virtual int64_t ReadRecords(int64_t num_records) = 0;
|
||||
|
||||
/// \brief Attempt to skip indicated number of records from column chunk.
|
||||
/// Note that for repeated fields, a record may have more than one value
|
||||
/// and all of them are skipped.
|
||||
/// \return number of records skipped
|
||||
virtual int64_t SkipRecords(int64_t num_records) = 0;
|
||||
|
||||
/// \brief Pre-allocate space for data. Results in better flat read performance
|
||||
virtual void Reserve(int64_t num_values) = 0;
|
||||
|
||||
/// \brief Clear consumed values and repetition/definition levels as the
|
||||
/// result of calling ReadRecords
|
||||
/// For FLBA and ByteArray types, call GetBuilderChunks() to reset them.
|
||||
virtual void Reset() = 0;
|
||||
|
||||
/// \brief Transfer filled values buffer to caller. A new one will be
|
||||
/// allocated in subsequent ReadRecords calls
|
||||
virtual std::shared_ptr<ResizableBuffer> ReleaseValues() = 0;
|
||||
|
||||
/// \brief Transfer filled validity bitmap buffer to caller. A new one will
|
||||
/// be allocated in subsequent ReadRecords calls
|
||||
virtual std::shared_ptr<ResizableBuffer> ReleaseIsValid() = 0;
|
||||
|
||||
/// \brief Return true if the record reader has more internal data yet to
|
||||
/// process
|
||||
virtual bool HasMoreData() const = 0;
|
||||
|
||||
/// \brief Advance record reader to the next row group. Must be set before
|
||||
/// any records could be read/skipped.
|
||||
/// \param[in] reader obtained from RowGroupReader::GetColumnPageReader
|
||||
virtual void SetPageReader(std::unique_ptr<PageReader> reader) = 0;
|
||||
|
||||
/// \brief Returns the underlying column reader's descriptor.
|
||||
virtual const ColumnDescriptor* descr() const = 0;
|
||||
|
||||
virtual void DebugPrintState() = 0;
|
||||
|
||||
/// \brief Returns the dictionary owned by the current decoder. Throws an
|
||||
/// exception if the current decoder is not for dictionary encoding. The caller is
|
||||
/// responsible for casting the returned pointer to proper type depending on the
|
||||
/// column's physical type. An example:
|
||||
/// const ByteArray* dict = reinterpret_cast<const ByteArray*>(ReadDictionary(&len));
|
||||
/// or:
|
||||
/// const float* dict = reinterpret_cast<const float*>(ReadDictionary(&len));
|
||||
/// \param[out] dictionary_length The number of dictionary entries.
|
||||
virtual const void* ReadDictionary(int32_t* dictionary_length) = 0;
|
||||
|
||||
/// \brief Decoded definition levels
|
||||
int16_t* def_levels() const {
|
||||
return reinterpret_cast<int16_t*>(def_levels_->mutable_data());
|
||||
}
|
||||
|
||||
/// \brief Decoded repetition levels
|
||||
int16_t* rep_levels() const {
|
||||
return reinterpret_cast<int16_t*>(rep_levels_->mutable_data());
|
||||
}
|
||||
|
||||
/// \brief Decoded values, including nulls, if any
|
||||
/// FLBA and ByteArray types do not use this array and read into their own
|
||||
/// builders.
|
||||
uint8_t* values() const { return values_->mutable_data(); }
|
||||
|
||||
/// \brief Number of values written, including space left for nulls if any.
|
||||
/// If this Reader was constructed with read_dense_for_nullable(), there is no space for
|
||||
/// nulls and null_count() will be 0. There is no read-ahead/buffering for values. For
|
||||
/// FLBA and ByteArray types this value reflects the values written with the last
|
||||
/// ReadRecords call since those readers will reset the values after each call.
|
||||
int64_t values_written() const { return values_written_; }
|
||||
|
||||
/// \brief Number of definition / repetition levels (from those that have
|
||||
/// been decoded) that have been consumed inside the reader.
|
||||
int64_t levels_position() const { return levels_position_; }
|
||||
|
||||
/// \brief Number of definition / repetition levels that have been written
|
||||
/// internally in the reader. This may be larger than values_written() because
|
||||
/// for repeated fields we need to look at the levels in advance to figure out
|
||||
/// the record boundaries.
|
||||
int64_t levels_written() const { return levels_written_; }
|
||||
|
||||
/// \brief Number of nulls in the leaf that we have read so far into the
|
||||
/// values vector. This is only valid when !read_dense_for_nullable(). When
|
||||
/// read_dense_for_nullable() it will always be 0.
|
||||
int64_t null_count() const { return null_count_; }
|
||||
|
||||
/// \brief True if the leaf values are nullable
|
||||
bool nullable_values() const { return nullable_values_; }
|
||||
|
||||
/// \brief True if reading directly as Arrow dictionary-encoded
|
||||
bool read_dictionary() const { return read_dictionary_; }
|
||||
|
||||
/// \brief True if reading dense for nullable columns.
|
||||
bool read_dense_for_nullable() const { return read_dense_for_nullable_; }
|
||||
|
||||
protected:
|
||||
/// \brief Indicates if we can have nullable values. Note that repeated fields
|
||||
/// may or may not be nullable.
|
||||
bool nullable_values_;
|
||||
|
||||
bool at_record_start_;
|
||||
int64_t records_read_;
|
||||
|
||||
/// \brief Stores values. These values are populated based on each ReadRecords
|
||||
/// call. No extra values are buffered for the next call. SkipRecords will not
|
||||
/// add any value to this buffer.
|
||||
std::shared_ptr<::arrow::ResizableBuffer> values_;
|
||||
/// \brief False for FIXED_LEN_BYTE_ARRAY and BYTE_ARRAY, in which case we
|
||||
/// don't allocate the values buffer and we directly read into builder classes.
|
||||
bool uses_values_;
|
||||
|
||||
/// \brief Values that we have read into 'values_' + 'null_count_'.
|
||||
int64_t values_written_;
|
||||
int64_t values_capacity_;
|
||||
int64_t null_count_;
|
||||
|
||||
/// \brief Each bit corresponds to one element in 'values_' and specifies if it
|
||||
/// is null or not null.
|
||||
///
|
||||
/// Not set if leaf type is not nullable or read_dense_for_nullable_ is true.
|
||||
std::shared_ptr<::arrow::ResizableBuffer> valid_bits_;
|
||||
|
||||
/// \brief Buffer for definition levels. May contain more levels than
|
||||
/// is actually read. This is because we read levels ahead to
|
||||
/// figure out record boundaries for repeated fields.
|
||||
/// For flat required fields, 'def_levels_' and 'rep_levels_' are not
|
||||
/// populated. For non-repeated fields 'rep_levels_' is not populated.
|
||||
/// 'def_levels_' and 'rep_levels_' must be of the same size if present.
|
||||
std::shared_ptr<::arrow::ResizableBuffer> def_levels_;
|
||||
/// \brief Buffer for repetition levels. Only populated for repeated
|
||||
/// fields.
|
||||
std::shared_ptr<::arrow::ResizableBuffer> rep_levels_;
|
||||
|
||||
/// \brief Number of definition / repetition levels that have been written
|
||||
/// internally in the reader. This may be larger than values_written() since
|
||||
/// for repeated fields we need to look at the levels in advance to figure out
|
||||
/// the record boundaries.
|
||||
int64_t levels_written_;
|
||||
/// \brief Position of the next level that should be consumed.
|
||||
int64_t levels_position_;
|
||||
int64_t levels_capacity_;
|
||||
|
||||
bool read_dictionary_ = false;
|
||||
// If true, we will not leave any space for the null values in the values_
|
||||
// vector or fill nulls values in BinaryRecordReader/DictionaryRecordReader.
|
||||
//
|
||||
// If read_dense_for_nullable_ is true, the BinaryRecordReader/DictionaryRecordReader
|
||||
// might still populate the validity bitmap buffer.
|
||||
bool read_dense_for_nullable_ = false;
|
||||
};
|
||||
|
||||
class BinaryRecordReader : virtual public RecordReader {
|
||||
public:
|
||||
virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
|
||||
};
|
||||
|
||||
/// \brief Read records directly to dictionary-encoded Arrow form (int32
|
||||
/// indices). Only valid for BYTE_ARRAY columns
|
||||
class DictionaryRecordReader : virtual public RecordReader {
|
||||
public:
|
||||
virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
// Convenience aliases: one TypedColumnReader instantiation per Parquet physical type.
using BoolReader = TypedColumnReader<BooleanType>;
|
||||
using Int32Reader = TypedColumnReader<Int32Type>;
|
||||
using Int64Reader = TypedColumnReader<Int64Type>;
|
||||
using Int96Reader = TypedColumnReader<Int96Type>;
|
||||
using FloatReader = TypedColumnReader<FloatType>;
|
||||
using DoubleReader = TypedColumnReader<DoubleType>;
|
||||
using ByteArrayReader = TypedColumnReader<ByteArrayType>;
|
||||
using FixedLenByteArrayReader = TypedColumnReader<FLBAType>;
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,264 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "parquet/column_reader.h"
|
||||
#include "parquet/exception.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/schema.h"
|
||||
#include "parquet/types.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
static constexpr int64_t DEFAULT_SCANNER_BATCH_SIZE = 128;
|
||||
|
||||
class PARQUET_EXPORT Scanner {
|
||||
public:
|
||||
explicit Scanner(std::shared_ptr<ColumnReader> reader,
|
||||
int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
|
||||
: batch_size_(batch_size),
|
||||
level_offset_(0),
|
||||
levels_buffered_(0),
|
||||
value_buffer_(AllocateBuffer(pool)),
|
||||
value_offset_(0),
|
||||
values_buffered_(0),
|
||||
reader_(std::move(reader)) {
|
||||
def_levels_.resize(
|
||||
descr()->max_definition_level() > 0 ? static_cast<size_t>(batch_size_) : 0);
|
||||
rep_levels_.resize(
|
||||
descr()->max_repetition_level() > 0 ? static_cast<size_t>(batch_size_) : 0);
|
||||
}
|
||||
|
||||
virtual ~Scanner() {}
|
||||
|
||||
static std::shared_ptr<Scanner> Make(
|
||||
std::shared_ptr<ColumnReader> col_reader,
|
||||
int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
|
||||
|
||||
virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0;
|
||||
|
||||
bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); }
|
||||
|
||||
const ColumnDescriptor* descr() const { return reader_->descr(); }
|
||||
|
||||
int64_t batch_size() const { return batch_size_; }
|
||||
|
||||
void SetBatchSize(int64_t batch_size) { batch_size_ = batch_size; }
|
||||
|
||||
protected:
|
||||
int64_t batch_size_;
|
||||
|
||||
std::vector<int16_t> def_levels_;
|
||||
std::vector<int16_t> rep_levels_;
|
||||
int level_offset_;
|
||||
int levels_buffered_;
|
||||
|
||||
std::shared_ptr<ResizableBuffer> value_buffer_;
|
||||
int value_offset_;
|
||||
int64_t values_buffered_;
|
||||
std::shared_ptr<ColumnReader> reader_;
|
||||
};
|
||||
|
||||
template <typename DType>
|
||||
class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner {
|
||||
public:
|
||||
typedef typename DType::c_type T;
|
||||
|
||||
explicit TypedScanner(std::shared_ptr<ColumnReader> reader,
|
||||
int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
|
||||
: Scanner(std::move(reader), batch_size, pool) {
|
||||
typed_reader_ = static_cast<TypedColumnReader<DType>*>(reader_.get());
|
||||
int value_byte_size = type_traits<DType::type_num>::value_byte_size;
|
||||
PARQUET_THROW_NOT_OK(value_buffer_->Resize(batch_size_ * value_byte_size));
|
||||
values_ = reinterpret_cast<T*>(value_buffer_->mutable_data());
|
||||
}
|
||||
|
||||
virtual ~TypedScanner() {}
|
||||
|
||||
bool NextLevels(int16_t* def_level, int16_t* rep_level) {
|
||||
if (level_offset_ == levels_buffered_) {
|
||||
levels_buffered_ = static_cast<int>(
|
||||
typed_reader_->ReadBatch(static_cast<int>(batch_size_), def_levels_.data(),
|
||||
rep_levels_.data(), values_, &values_buffered_));
|
||||
|
||||
value_offset_ = 0;
|
||||
level_offset_ = 0;
|
||||
if (!levels_buffered_) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
*def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0;
|
||||
*rep_level = descr()->max_repetition_level() > 0 ? rep_levels_[level_offset_] : 0;
|
||||
level_offset_++;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Next(T* val, int16_t* def_level, int16_t* rep_level, bool* is_null) {
|
||||
if (level_offset_ == levels_buffered_) {
|
||||
if (!HasNext()) {
|
||||
// Out of data pages
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
NextLevels(def_level, rep_level);
|
||||
*is_null = *def_level < descr()->max_definition_level();
|
||||
|
||||
if (*is_null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (value_offset_ == values_buffered_) {
|
||||
throw ParquetException("Value was non-null, but has not been buffered");
|
||||
}
|
||||
*val = values_[value_offset_++];
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns true if there is a next value
|
||||
bool NextValue(T* val, bool* is_null) {
|
||||
if (level_offset_ == levels_buffered_) {
|
||||
if (!HasNext()) {
|
||||
// Out of data pages
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Out of values
|
||||
int16_t def_level = -1;
|
||||
int16_t rep_level = -1;
|
||||
NextLevels(&def_level, &rep_level);
|
||||
*is_null = def_level < descr()->max_definition_level();
|
||||
|
||||
if (*is_null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (value_offset_ == values_buffered_) {
|
||||
throw ParquetException("Value was non-null, but has not been buffered");
|
||||
}
|
||||
*val = values_[value_offset_++];
|
||||
return true;
|
||||
}
|
||||
|
||||
virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) {
|
||||
T val{};
|
||||
int16_t def_level = -1;
|
||||
int16_t rep_level = -1;
|
||||
bool is_null = false;
|
||||
char buffer[80];
|
||||
|
||||
if (!Next(&val, &def_level, &rep_level, &is_null)) {
|
||||
throw ParquetException("No more values buffered");
|
||||
}
|
||||
|
||||
if (with_levels) {
|
||||
out << " D:" << def_level << " R:" << rep_level << " ";
|
||||
if (!is_null) {
|
||||
out << "V:";
|
||||
}
|
||||
}
|
||||
|
||||
if (is_null) {
|
||||
std::string null_fmt = format_fwf<ByteArrayType>(width);
|
||||
snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
|
||||
} else {
|
||||
FormatValue(&val, buffer, sizeof(buffer), width);
|
||||
}
|
||||
out << buffer;
|
||||
}
|
||||
|
||||
private:
|
||||
// The ownership of this object is expressed through the reader_ variable in the base
|
||||
TypedColumnReader<DType>* typed_reader_;
|
||||
|
||||
inline void FormatValue(void* val, char* buffer, int bufsize, int width);
|
||||
|
||||
T* values_;
|
||||
};
|
||||
|
||||
template <typename DType>
|
||||
inline void TypedScanner<DType>::FormatValue(void* val, char* buffer, int bufsize,
|
||||
int width) {
|
||||
std::string fmt = format_fwf<DType>(width);
|
||||
snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast<T*>(val));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void TypedScanner<Int96Type>::FormatValue(void* val, char* buffer, int bufsize,
|
||||
int width) {
|
||||
std::string fmt = format_fwf<Int96Type>(width);
|
||||
std::string result = Int96ToString(*reinterpret_cast<Int96*>(val));
|
||||
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void TypedScanner<ByteArrayType>::FormatValue(void* val, char* buffer, int bufsize,
|
||||
int width) {
|
||||
std::string fmt = format_fwf<ByteArrayType>(width);
|
||||
std::string result = ByteArrayToString(*reinterpret_cast<ByteArray*>(val));
|
||||
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void TypedScanner<FLBAType>::FormatValue(void* val, char* buffer, int bufsize,
|
||||
int width) {
|
||||
std::string fmt = format_fwf<FLBAType>(width);
|
||||
std::string result = FixedLenByteArrayToString(
|
||||
*reinterpret_cast<FixedLenByteArray*>(val), descr()->type_length());
|
||||
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
|
||||
}
|
||||
|
||||
// Convenience aliases: one TypedScanner instantiation per Parquet physical type.
using BoolScanner = TypedScanner<BooleanType>;
using Int32Scanner = TypedScanner<Int32Type>;
using Int64Scanner = TypedScanner<Int64Type>;
using Int96Scanner = TypedScanner<Int96Type>;
using FloatScanner = TypedScanner<FloatType>;
using DoubleScanner = TypedScanner<DoubleType>;
using ByteArrayScanner = TypedScanner<ByteArrayType>;
using FixedLenByteArrayScanner = TypedScanner<FLBAType>;
|
||||
|
||||
template <typename RType>
|
||||
int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
|
||||
uint8_t* values, int64_t* values_buffered,
|
||||
parquet::ColumnReader* reader) {
|
||||
typedef typename RType::T Type;
|
||||
auto typed_reader = static_cast<RType*>(reader);
|
||||
auto vals = reinterpret_cast<Type*>(&values[0]);
|
||||
return typed_reader->ReadBatch(batch_size, def_levels, rep_levels, vals,
|
||||
values_buffered);
|
||||
}
|
||||
|
||||
// Non-template counterpart of ScanAll(): same argument list, but no reader-type
// template parameter. Only declared here; the definition is not in this header.
// NOTE(review): presumably dispatches on the reader's physical type to the matching
// ScanAll<...> instantiation -- confirm against the implementation file.
int64_t PARQUET_EXPORT ScanAllValues(int32_t batch_size, int16_t* def_levels,
|
||||
int16_t* rep_levels, uint8_t* values,
|
||||
int64_t* values_buffered,
|
||||
parquet::ColumnReader* reader);
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,304 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/compression.h"
|
||||
#include "parquet/exception.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/types.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
|
||||
namespace bit_util {
|
||||
class BitWriter;
|
||||
} // namespace bit_util
|
||||
|
||||
namespace util {
|
||||
class RleBitPackedEncoder;
|
||||
class CodecOptions;
|
||||
} // namespace util
|
||||
|
||||
} // namespace arrow
|
||||
|
||||
namespace parquet {
|
||||
|
||||
struct ArrowWriteContext;
|
||||
class ColumnChunkMetaDataBuilder;
|
||||
class ColumnDescriptor;
|
||||
class ColumnIndexBuilder;
|
||||
class DataPage;
|
||||
class DictionaryPage;
|
||||
class Encryptor;
|
||||
class OffsetIndexBuilder;
|
||||
class WriterProperties;
|
||||
|
||||
class PARQUET_EXPORT LevelEncoder {
|
||||
public:
|
||||
LevelEncoder();
|
||||
~LevelEncoder();
|
||||
|
||||
static int MaxBufferSize(Encoding::type encoding, int16_t max_level,
|
||||
int num_buffered_values);
|
||||
|
||||
// Initialize the LevelEncoder.
|
||||
void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
|
||||
uint8_t* data, int data_size);
|
||||
|
||||
// Encodes a batch of levels from an array and returns the number of levels encoded
|
||||
int Encode(int batch_size, const int16_t* levels);
|
||||
|
||||
int32_t len() {
|
||||
if (encoding_ != Encoding::RLE) {
|
||||
throw ParquetException("Only implemented for RLE encoding");
|
||||
}
|
||||
return rle_length_;
|
||||
}
|
||||
|
||||
private:
|
||||
int bit_width_;
|
||||
int rle_length_;
|
||||
Encoding::type encoding_;
|
||||
std::unique_ptr<::arrow::util::RleBitPackedEncoder> rle_encoder_;
|
||||
std::unique_ptr<::arrow::bit_util::BitWriter> bit_packed_encoder_;
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT PageWriter {
|
||||
public:
|
||||
virtual ~PageWriter() {}
|
||||
|
||||
static std::unique_ptr<PageWriter> Open(
|
||||
std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
|
||||
ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal = -1,
|
||||
int16_t column_chunk_ordinal = -1,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
|
||||
bool buffered_row_group = false,
|
||||
std::shared_ptr<Encryptor> header_encryptor = NULLPTR,
|
||||
std::shared_ptr<Encryptor> data_encryptor = NULLPTR,
|
||||
bool page_write_checksum_enabled = false,
|
||||
// column_index_builder MUST outlive the PageWriter
|
||||
ColumnIndexBuilder* column_index_builder = NULLPTR,
|
||||
// offset_index_builder MUST outlive the PageWriter
|
||||
OffsetIndexBuilder* offset_index_builder = NULLPTR,
|
||||
const CodecOptions& codec_options = CodecOptions{});
|
||||
|
||||
// The Column Writer decides if dictionary encoding is used if set and
|
||||
// if the dictionary encoding has fallen back to default encoding on reaching dictionary
|
||||
// page limit
|
||||
virtual void Close(bool has_dictionary, bool fallback) = 0;
|
||||
|
||||
// Return the number of uncompressed bytes written (including header size)
|
||||
virtual int64_t WriteDataPage(const DataPage& page) = 0;
|
||||
|
||||
// Return the number of uncompressed bytes written (including header size)
|
||||
virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0;
|
||||
|
||||
/// \brief The total number of bytes written as serialized data and
|
||||
/// dictionary pages to the sink so far.
|
||||
virtual int64_t total_compressed_bytes_written() const = 0;
|
||||
|
||||
virtual bool has_compressor() = 0;
|
||||
|
||||
virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0;
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT ColumnWriter {
|
||||
public:
|
||||
virtual ~ColumnWriter() = default;
|
||||
|
||||
static std::shared_ptr<ColumnWriter> Make(ColumnChunkMetaDataBuilder*,
|
||||
std::unique_ptr<PageWriter>,
|
||||
const WriterProperties* properties);
|
||||
|
||||
/// \brief Closes the ColumnWriter, commits any buffered values to pages.
|
||||
/// \return Total size of the column in bytes
|
||||
virtual int64_t Close() = 0;
|
||||
|
||||
/// \brief The physical Parquet type of the column
|
||||
virtual Type::type type() const = 0;
|
||||
|
||||
/// \brief The schema for the column
|
||||
virtual const ColumnDescriptor* descr() const = 0;
|
||||
|
||||
/// \brief The number of rows written so far
|
||||
virtual int64_t rows_written() const = 0;
|
||||
|
||||
/// \brief The total size of the compressed pages + page headers. Values
|
||||
/// are still buffered and not written to a pager yet
|
||||
///
|
||||
/// So in un-buffered mode, it always returns 0
|
||||
virtual int64_t total_compressed_bytes() const = 0;
|
||||
|
||||
/// \brief The total number of bytes written as serialized data and
|
||||
/// dictionary pages to the ColumnChunk so far
|
||||
/// These bytes are uncompressed bytes.
|
||||
virtual int64_t total_bytes_written() const = 0;
|
||||
|
||||
/// \brief The total number of bytes written as serialized data and
|
||||
/// dictionary pages to the ColumnChunk so far.
|
||||
/// If the column is uncompressed, the value would be equal to
|
||||
/// total_bytes_written().
|
||||
virtual int64_t total_compressed_bytes_written() const = 0;
|
||||
|
||||
/// \brief Estimated size of the values that are not written to a page yet.
|
||||
virtual int64_t estimated_buffered_value_bytes() const = 0;
|
||||
|
||||
/// \brief The file-level writer properties
|
||||
virtual const WriterProperties* properties() = 0;
|
||||
|
||||
/// \brief Add key-value metadata to the ColumnChunk.
|
||||
/// \param[in] key_value_metadata the metadata to add.
|
||||
/// \note This will overwrite any existing metadata with the same key.
|
||||
/// \throw ParquetException if Close() has been called.
|
||||
virtual void AddKeyValueMetadata(
|
||||
const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata) = 0;
|
||||
|
||||
/// \brief Reset the ColumnChunk key-value metadata.
|
||||
/// \throw ParquetException if Close() has been called.
|
||||
virtual void ResetKeyValueMetadata() = 0;
|
||||
|
||||
/// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns
|
||||
/// error status if the array data type is not compatible with the concrete
|
||||
/// writer type.
|
||||
///
|
||||
/// leaf_array is always a primitive (possibly dictionary encoded type).
|
||||
/// Leaf_field_nullable indicates whether the leaf array is considered nullable
|
||||
/// according to its schema in a Table or its parent array.
|
||||
virtual ::arrow::Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
|
||||
int64_t num_levels, const ::arrow::Array& leaf_array,
|
||||
ArrowWriteContext* ctx,
|
||||
bool leaf_field_nullable) = 0;
|
||||
};
|
||||
|
||||
// API to write values to a single column. This is the main client facing API.
|
||||
template <typename DType>
|
||||
class TypedColumnWriter : public ColumnWriter {
|
||||
public:
|
||||
using T = typename DType::c_type;
|
||||
|
||||
// Write a batch of repetition levels, definition levels, and values to the
|
||||
// column.
|
||||
// `num_values` is the number of logical leaf values.
|
||||
// `def_levels` (resp. `rep_levels`) can be null if the column's max definition level
|
||||
// (resp. max repetition level) is 0.
|
||||
// If not null, each of `def_levels` and `rep_levels` must have at least
|
||||
// `num_values`.
|
||||
//
|
||||
// The number of physical values written (taken from `values`) is returned.
|
||||
// It can be smaller than `num_values` is there are some undefined values.
|
||||
virtual int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
|
||||
const int16_t* rep_levels, const T* values) = 0;
|
||||
|
||||
/// Write a batch of repetition levels, definition levels, and values to the
|
||||
/// column.
|
||||
///
|
||||
/// In comparison to WriteBatch the length of repetition and definition levels
|
||||
/// is the same as of the number of values read for max_definition_level == 1.
|
||||
/// In the case of max_definition_level > 1, the repetition and definition
|
||||
/// levels are larger than the values but the values include the null entries
|
||||
/// with definition_level == (max_definition_level - 1). Thus we have to differentiate
|
||||
/// in the parameters of this function if the input has the length of num_values or the
|
||||
/// _number of rows in the lowest nesting level_.
|
||||
///
|
||||
/// In the case that the most inner node in the Parquet is required, the _number of rows
|
||||
/// in the lowest nesting level_ is equal to the number of non-null values. If the
|
||||
/// inner-most schema node is optional, the _number of rows in the lowest nesting level_
|
||||
/// also includes all values with definition_level == (max_definition_level - 1).
|
||||
///
|
||||
/// @param num_values number of levels to write.
|
||||
/// @param def_levels The Parquet definition levels, length is num_values
|
||||
/// @param rep_levels The Parquet repetition levels, length is num_values
|
||||
/// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
|
||||
/// level. The length is number of rows in the lowest nesting level.
|
||||
/// @param valid_bits_offset The offset in bits of the valid_bits where the
|
||||
/// first relevant bit resides.
|
||||
/// @param values The values in the lowest nested level including
|
||||
/// spacing for nulls on the lowest levels; input has the length
|
||||
/// of the number of rows on the lowest nesting level.
|
||||
virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
|
||||
const int16_t* rep_levels, const uint8_t* valid_bits,
|
||||
int64_t valid_bits_offset, const T* values) = 0;
|
||||
};
|
||||
|
||||
// Short names for the TypedColumnWriter instantiations, one per listed data type.
using BoolWriter = TypedColumnWriter<BooleanType>;
using Int32Writer = TypedColumnWriter<Int32Type>;
using Int64Writer = TypedColumnWriter<Int64Type>;
using Int96Writer = TypedColumnWriter<Int96Type>;
using FloatWriter = TypedColumnWriter<FloatType>;
using DoubleWriter = TypedColumnWriter<DoubleType>;
using ByteArrayWriter = TypedColumnWriter<ByteArrayType>;
using FixedLenByteArrayWriter = TypedColumnWriter<FLBAType>;
|
||||
|
||||
namespace internal {
|
||||
|
||||
/**
|
||||
* Timestamp conversion constants
|
||||
*/
|
||||
constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588);
|
||||
|
||||
template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
|
||||
inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) {
|
||||
int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
|
||||
(*impala_timestamp).value[2] = (uint32_t)julian_days;
|
||||
|
||||
int64_t last_day_units = time % UnitPerDay;
|
||||
auto last_day_nanos = last_day_units * NanosecondsPerUnit;
|
||||
// impala_timestamp will be unaligned every other entry so do memcpy instead
|
||||
// of assign and reinterpret cast to avoid undefined behavior.
|
||||
std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
|
||||
}
|
||||
|
||||
constexpr int64_t kSecondsInNanos = INT64_C(1000000000);
|
||||
|
||||
inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) {
|
||||
ArrowTimestampToImpalaTimestamp<kSecondsPerDay, kSecondsInNanos>(seconds,
|
||||
impala_timestamp);
|
||||
}
|
||||
|
||||
constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000);
|
||||
|
||||
inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds,
|
||||
Int96* impala_timestamp) {
|
||||
ArrowTimestampToImpalaTimestamp<kMillisecondsPerDay, kMillisecondsInNanos>(
|
||||
milliseconds, impala_timestamp);
|
||||
}
|
||||
|
||||
constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000);
|
||||
|
||||
inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds,
|
||||
Int96* impala_timestamp) {
|
||||
ArrowTimestampToImpalaTimestamp<kMicrosecondsPerDay, kMicrosecondsInNanos>(
|
||||
microseconds, impala_timestamp);
|
||||
}
|
||||
|
||||
constexpr int64_t kNanosecondsInNanos = INT64_C(1);
|
||||
|
||||
inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds,
|
||||
Int96* impala_timestamp) {
|
||||
ArrowTimestampToImpalaTimestamp<kNanosecondsPerDay, kNanosecondsInNanos>(
|
||||
nanoseconds, impala_timestamp);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,458 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
|
||||
#include "parquet/exception.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/types.h"
|
||||
|
||||
namespace arrow {

// Forward declaration only; it is used below to name DictAccumulator types.
template <typename T>
class Dictionary32Builder;

}  // namespace arrow
|
||||
|
||||
namespace parquet {
|
||||
|
||||
template <typename DType>
|
||||
class TypedEncoder;
|
||||
|
||||
using BooleanEncoder = TypedEncoder<BooleanType>;
|
||||
using Int32Encoder = TypedEncoder<Int32Type>;
|
||||
using Int64Encoder = TypedEncoder<Int64Type>;
|
||||
using Int96Encoder = TypedEncoder<Int96Type>;
|
||||
using FloatEncoder = TypedEncoder<FloatType>;
|
||||
using DoubleEncoder = TypedEncoder<DoubleType>;
|
||||
using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
|
||||
using FLBAEncoder = TypedEncoder<FLBAType>;
|
||||
|
||||
template <typename DType>
|
||||
class TypedDecoder;
|
||||
|
||||
class BooleanDecoder;
|
||||
using Int32Decoder = TypedDecoder<Int32Type>;
|
||||
using Int64Decoder = TypedDecoder<Int64Type>;
|
||||
using Int96Decoder = TypedDecoder<Int96Type>;
|
||||
using FloatDecoder = TypedDecoder<FloatType>;
|
||||
using DoubleDecoder = TypedDecoder<DoubleType>;
|
||||
using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
|
||||
class FLBADecoder;
|
||||
|
||||
template <typename T>
|
||||
struct EncodingTraits;
|
||||
|
||||
template <>
|
||||
struct EncodingTraits<BooleanType> {
|
||||
using Encoder = BooleanEncoder;
|
||||
using Decoder = BooleanDecoder;
|
||||
|
||||
using ArrowType = ::arrow::BooleanType;
|
||||
using Accumulator = ::arrow::BooleanBuilder;
|
||||
struct DictAccumulator {};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EncodingTraits<Int32Type> {
|
||||
using Encoder = Int32Encoder;
|
||||
using Decoder = Int32Decoder;
|
||||
|
||||
using ArrowType = ::arrow::Int32Type;
|
||||
using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
|
||||
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EncodingTraits<Int64Type> {
|
||||
using Encoder = Int64Encoder;
|
||||
using Decoder = Int64Decoder;
|
||||
|
||||
using ArrowType = ::arrow::Int64Type;
|
||||
using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
|
||||
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EncodingTraits<Int96Type> {
|
||||
using Encoder = Int96Encoder;
|
||||
using Decoder = Int96Decoder;
|
||||
|
||||
struct Accumulator {};
|
||||
struct DictAccumulator {};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EncodingTraits<FloatType> {
|
||||
using Encoder = FloatEncoder;
|
||||
using Decoder = FloatDecoder;
|
||||
|
||||
using ArrowType = ::arrow::FloatType;
|
||||
using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
|
||||
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EncodingTraits<DoubleType> {
|
||||
using Encoder = DoubleEncoder;
|
||||
using Decoder = DoubleDecoder;
|
||||
|
||||
using ArrowType = ::arrow::DoubleType;
|
||||
using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
|
||||
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EncodingTraits<ByteArrayType> {
|
||||
using Encoder = ByteArrayEncoder;
|
||||
using Decoder = ByteArrayDecoder;
|
||||
|
||||
/// \brief Internal helper class for decoding BYTE_ARRAY data
|
||||
///
|
||||
/// This class allows the caller to choose the concrete Arrow data type
|
||||
/// by passing a corresponding `ArrayBuilder`.
|
||||
/// Supported `ArrayBuilder` classes are `BinaryBuilder`, `LargeBinaryBuilder`
|
||||
/// and `BinaryViewBuilder`.
|
||||
/// If the builder is a `BinaryBuilder`, `chunks` can accumulate several
|
||||
/// arrays as needed to work around the 32-bit offset limit.
|
||||
struct Accumulator {
|
||||
std::unique_ptr<::arrow::ArrayBuilder> builder;
|
||||
std::vector<std::shared_ptr<::arrow::Array>> chunks;
|
||||
};
|
||||
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct EncodingTraits<FLBAType> {
|
||||
using Encoder = FLBAEncoder;
|
||||
using Decoder = FLBADecoder;
|
||||
|
||||
using ArrowType = ::arrow::FixedSizeBinaryType;
|
||||
using Accumulator = ::arrow::FixedSizeBinaryBuilder;
|
||||
using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
|
||||
};
|
||||
|
||||
class ColumnDescriptor;
|
||||
|
||||
// Untyped base for all encoders
|
||||
class Encoder {
|
||||
public:
|
||||
virtual ~Encoder() = default;
|
||||
|
||||
virtual int64_t EstimatedDataEncodedSize() = 0;
|
||||
virtual std::shared_ptr<Buffer> FlushValues() = 0;
|
||||
virtual Encoding::type encoding() const = 0;
|
||||
|
||||
virtual void Put(const ::arrow::Array& values) = 0;
|
||||
|
||||
// Report the number of bytes written to the encoder since the last report.
|
||||
// It only works for BYTE_ARRAY type and throw for other types.
|
||||
// This call is not idempotent since it resets the internal counter.
|
||||
virtual int64_t ReportUnencodedDataBytes() = 0;
|
||||
|
||||
virtual MemoryPool* memory_pool() const = 0;
|
||||
};
|
||||
|
||||
// Base class for value encoders. Since encoders may or not have state (e.g.,
|
||||
// dictionary encoding) we use a class instance to maintain any state.
|
||||
//
|
||||
// Encode interfaces are internal, subject to change without deprecation.
|
||||
template <typename DType>
|
||||
class TypedEncoder : virtual public Encoder {
|
||||
public:
|
||||
using T = typename DType::c_type;
|
||||
|
||||
using Encoder::Put;
|
||||
|
||||
virtual void Put(const T* src, int num_values) = 0;
|
||||
|
||||
virtual void Put(const std::vector<T>& src, int num_values = -1);
|
||||
|
||||
virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
|
||||
int64_t valid_bits_offset) = 0;
|
||||
};
|
||||
|
||||
template <typename DType>
|
||||
void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
|
||||
if (num_values == -1) {
|
||||
num_values = static_cast<int>(src.size());
|
||||
}
|
||||
Put(src.data(), num_values);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
|
||||
// NOTE(wesm): This stub is here only to satisfy the compiler; it is
|
||||
// overridden later with the actual implementation
|
||||
}
|
||||
|
||||
// Base class for dictionary encoders
|
||||
template <typename DType>
|
||||
class DictEncoder : virtual public TypedEncoder<DType> {
|
||||
public:
|
||||
/// Writes out any buffered indices to buffer preceded by the bit width of this data.
|
||||
/// Returns the number of bytes written.
|
||||
/// If the supplied buffer is not big enough, returns -1.
|
||||
/// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
|
||||
/// to size buffer.
|
||||
virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;
|
||||
|
||||
virtual int dict_encoded_size() const = 0;
|
||||
|
||||
virtual int bit_width() const = 0;
|
||||
|
||||
/// Writes out the encoded dictionary to buffer. buffer must be preallocated to
|
||||
/// dict_encoded_size() bytes.
|
||||
virtual void WriteDict(uint8_t* buffer) const = 0;
|
||||
|
||||
virtual int num_entries() const = 0;
|
||||
|
||||
/// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
|
||||
/// assumed (without any boundschecking) that the indices reference
|
||||
/// preexisting dictionary values
|
||||
/// \param[in] indices the dictionary index values. Only Int32Array currently
|
||||
/// supported
|
||||
virtual void PutIndices(const ::arrow::Array& indices) = 0;
|
||||
|
||||
/// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
|
||||
/// separately. Currently throws exception if the current dictionary memo is
|
||||
/// non-empty
|
||||
/// \param[in] values the dictionary values. Only valid for certain
|
||||
/// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
|
||||
virtual void PutDictionary(const ::arrow::Array& values) = 0;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Value decoding
|
||||
|
||||
class Decoder {
|
||||
public:
|
||||
virtual ~Decoder() = default;
|
||||
|
||||
// Sets the data for a new page. This will be called multiple times on the same
|
||||
// decoder and should reset all internal state.
|
||||
//
|
||||
// `num_values` comes from the data page header, and may be greater than the number of
|
||||
// physical values in the data buffer if there are some omitted (null) values.
|
||||
// `len`, on the other hand, is the size in bytes of the data buffer and
|
||||
// directly relates to the number of physical values.
|
||||
virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
|
||||
|
||||
// Returns the number of values left (for the last call to SetData()). This is
|
||||
// the number of values left in this page.
|
||||
virtual int values_left() const = 0;
|
||||
virtual Encoding::type encoding() const = 0;
|
||||
};
|
||||
|
||||
template <typename DType>
|
||||
class TypedDecoder : virtual public Decoder {
|
||||
public:
|
||||
using T = typename DType::c_type;
|
||||
|
||||
/// \brief Decode values into a buffer
|
||||
///
|
||||
/// Subclasses may override the more specialized Decode methods below.
|
||||
///
|
||||
/// \param[in] buffer destination for decoded values
|
||||
/// \param[in] max_values maximum number of values to decode
|
||||
/// \return The number of values decoded. Should be identical to max_values except
|
||||
/// at the end of the current data page.
|
||||
virtual int Decode(T* buffer, int max_values) = 0;
|
||||
|
||||
/// \brief Decode the values in this data page but leave spaces for null entries.
|
||||
///
|
||||
/// \param[in] buffer destination for decoded values
|
||||
/// \param[in] num_values size of the def_levels and buffer arrays including the number
|
||||
/// of null slots
|
||||
/// \param[in] null_count number of null slots
|
||||
/// \param[in] valid_bits bitmap data indicating position of valid slots
|
||||
/// \param[in] valid_bits_offset offset into valid_bits
|
||||
/// \return The number of values decoded, including nulls.
|
||||
virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
|
||||
const uint8_t* valid_bits, int64_t valid_bits_offset) = 0;
|
||||
|
||||
/// \brief Decode into an ArrayBuilder or other accumulator
|
||||
///
|
||||
/// This function assumes the definition levels were already decoded
|
||||
/// as a validity bitmap in the given `valid_bits`. `null_count`
|
||||
/// is the number of 0s in `valid_bits`.
|
||||
/// As a space optimization, it is allowed for `valid_bits` to be null
|
||||
/// if `null_count` is zero.
|
||||
///
|
||||
/// \return number of values decoded
|
||||
virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
|
||||
int64_t valid_bits_offset,
|
||||
typename EncodingTraits<DType>::Accumulator* out) = 0;
|
||||
|
||||
/// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
|
||||
///
|
||||
/// \return number of values decoded
|
||||
int DecodeArrowNonNull(int num_values,
|
||||
typename EncodingTraits<DType>::Accumulator* out) {
|
||||
return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
|
||||
}
|
||||
|
||||
/// \brief Decode into a DictionaryBuilder
|
||||
///
|
||||
/// This function assumes the definition levels were already decoded
|
||||
/// as a validity bitmap in the given `valid_bits`. `null_count`
|
||||
/// is the number of 0s in `valid_bits`.
|
||||
/// As a space optimization, it is allowed for `valid_bits` to be null
|
||||
/// if `null_count` is zero.
|
||||
///
|
||||
/// \return number of values decoded
|
||||
virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
|
||||
int64_t valid_bits_offset,
|
||||
typename EncodingTraits<DType>::DictAccumulator* builder) = 0;
|
||||
|
||||
/// \brief Decode into a DictionaryBuilder ignoring nulls
|
||||
///
|
||||
/// \return number of values decoded
|
||||
int DecodeArrowNonNull(int num_values,
|
||||
typename EncodingTraits<DType>::DictAccumulator* builder) {
|
||||
return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename DType>
|
||||
class DictDecoder : virtual public TypedDecoder<DType> {
|
||||
public:
|
||||
using T = typename DType::c_type;
|
||||
|
||||
virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;
|
||||
|
||||
/// \brief Insert dictionary values into the Arrow dictionary builder's memo,
|
||||
/// but do not append any indices
|
||||
virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;
|
||||
|
||||
/// \brief Decode only dictionary indices and append to dictionary
|
||||
/// builder. The builder must have had the dictionary from this decoder
|
||||
/// inserted already.
|
||||
///
|
||||
/// \warning Remember to reset the builder each time the dict decoder is initialized
|
||||
/// with a new dictionary page
|
||||
virtual int DecodeIndicesSpaced(int num_values, int null_count,
|
||||
const uint8_t* valid_bits, int64_t valid_bits_offset,
|
||||
::arrow::ArrayBuilder* builder) = 0;
|
||||
|
||||
/// \brief Decode only dictionary indices (no nulls)
|
||||
///
|
||||
/// \warning Remember to reset the builder each time the dict decoder is initialized
|
||||
/// with a new dictionary page
|
||||
virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;
|
||||
|
||||
/// \brief Decode only dictionary indices (no nulls). Same as above
|
||||
/// DecodeIndices but target is an array instead of a builder.
|
||||
///
|
||||
/// \note API EXPERIMENTAL
|
||||
virtual int DecodeIndices(int num_values, int32_t* indices) = 0;
|
||||
|
||||
/// \brief Get dictionary. The reader will call this API when it encounters a
|
||||
/// new dictionary.
|
||||
///
|
||||
/// @param[out] dictionary The pointer to dictionary values. Dictionary is owned by
|
||||
/// the decoder and is destroyed when the decoder is destroyed.
|
||||
/// @param[out] dictionary_length The dictionary length.
|
||||
///
|
||||
/// \note API EXPERIMENTAL
|
||||
virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// TypedEncoder specializations, traits, and factory functions
|
||||
|
||||
/// Decoder interface for BooleanType with an extra bit-packed output overload.
class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
 public:
  // Keep the inherited Decode overload visible alongside the one added below.
  using TypedDecoder<BooleanType>::Decode;

  /// \brief Decode and bit-pack values into a buffer
  ///
  /// \param[in] buffer destination for decoded values
  /// This buffer will contain bit-packed values. If
  /// max_values is not a multiple of 8, the trailing bits
  /// of the last byte will be undefined.
  /// \param[in] max_values max values to decode.
  /// \return The number of values decoded. Should be identical to max_values except
  /// at the end of the current data page.
  virtual int Decode(uint8_t* buffer, int max_values) = 0;
};
|
||||
|
||||
/// Decoder interface for FLBAType; currently adds nothing beyond TypedDecoder.
class FLBADecoder : virtual public TypedDecoder<FLBAType> {
 public:
  using TypedDecoder<FLBAType>::DecodeSpaced;

  // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
  // there is value in adding specialized read methods for
  // FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
  // then perhaps not
};
|
||||
|
||||
/// Factory returning an untyped encoder for the given physical type and
/// encoding. `descr` defaults to null and `pool` to the default memory pool.
PARQUET_EXPORT
std::unique_ptr<Encoder> MakeEncoder(
    Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
    const ColumnDescriptor* descr = NULLPTR,
    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
|
||||
|
||||
template <typename DType>
|
||||
std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
|
||||
Encoding::type encoding, bool use_dictionary = false,
|
||||
const ColumnDescriptor* descr = NULLPTR,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
|
||||
using OutType = typename EncodingTraits<DType>::Encoder;
|
||||
std::unique_ptr<Encoder> base =
|
||||
MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
|
||||
return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
|
||||
}
|
||||
|
||||
/// Factory returning an untyped decoder for the given physical type and
/// encoding. `descr` defaults to null and `pool` to the default memory pool.
PARQUET_EXPORT
std::unique_ptr<Decoder> MakeDecoder(
    Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
|
||||
|
||||
namespace detail {
|
||||
|
||||
PARQUET_EXPORT
|
||||
std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
|
||||
const ColumnDescriptor* descr,
|
||||
::arrow::MemoryPool* pool);
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <typename DType>
|
||||
std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
|
||||
const ColumnDescriptor* descr = NULLPTR,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
|
||||
using OutType = DictDecoder<DType>;
|
||||
auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
|
||||
return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
|
||||
}
|
||||
|
||||
template <typename DType>
|
||||
std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
|
||||
Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
|
||||
using OutType = typename EncodingTraits<DType>::Decoder;
|
||||
std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr, pool);
|
||||
return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
|
||||
}
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,152 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "parquet/encryption/encryption.h"
|
||||
#include "parquet/encryption/file_key_wrapper.h"
|
||||
#include "parquet/encryption/key_toolkit.h"
|
||||
#include "parquet/encryption/kms_client_factory.h"
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
// Default values for the EncryptionConfiguration / DecryptionConfiguration
// fields and the RotateMasterKeys() parameters declared below.
static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
    ParquetCipher::AES_GCM_V1;
static constexpr bool kDefaultPlaintextFooter = false;
static constexpr bool kDefaultDoubleWrapping = true;
static constexpr double kDefaultCacheLifetimeSeconds = 600;  // 10 minutes
static constexpr bool kDefaultInternalKeyMaterial = true;
static constexpr bool kDefaultUniformEncryption = false;
static constexpr int32_t kDefaultDataKeyLengthBits = 128;
|
||||
|
||||
struct PARQUET_EXPORT EncryptionConfiguration {
|
||||
explicit EncryptionConfiguration(const std::string& footer_key)
|
||||
: footer_key(footer_key) {}
|
||||
|
||||
/// ID of the master key for footer encryption/signing
|
||||
std::string footer_key;
|
||||
|
||||
/// List of columns to encrypt, with column master key IDs (see HIVE-21848).
|
||||
/// Format: "columnKeyID:colName,colName;columnKeyID:colName..."
|
||||
/// Either
|
||||
/// (1) column_keys must be set
|
||||
/// or
|
||||
/// (2) uniform_encryption must be set to true
|
||||
/// If none of (1) and (2) are true, or if both are true, an exception will be
|
||||
/// thrown.
|
||||
std::string column_keys;
|
||||
|
||||
/// Encrypt footer and all columns with the same encryption key.
|
||||
bool uniform_encryption = kDefaultUniformEncryption;
|
||||
|
||||
/// Parquet encryption algorithm. Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1".
|
||||
ParquetCipher::type encryption_algorithm = kDefaultEncryptionAlgorithm;
|
||||
|
||||
/// Write files with plaintext footer.
|
||||
/// The default is false - files are written with encrypted footer.
|
||||
bool plaintext_footer = kDefaultPlaintextFooter;
|
||||
|
||||
/// Use double wrapping - where data encryption keys (DEKs) are encrypted with key
|
||||
/// encryption keys (KEKs), which in turn are encrypted with master keys.
|
||||
/// The default is true. If set to false, use single wrapping - where DEKs are
|
||||
/// encrypted directly with master keys.
|
||||
bool double_wrapping = kDefaultDoubleWrapping;
|
||||
|
||||
/// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client
|
||||
/// objects).
|
||||
/// The default is 600 (10 minutes).
|
||||
double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;
|
||||
|
||||
/// Store key material inside Parquet file footers; this mode doesn’t produce
|
||||
/// additional files. By default, true. If set to false, key material is stored in
|
||||
/// separate files in the same folder, which enables key rotation for immutable
|
||||
/// Parquet files.
|
||||
bool internal_key_material = kDefaultInternalKeyMaterial;
|
||||
|
||||
/// Length of data encryption keys (DEKs), randomly generated by parquet key
|
||||
/// management tools. Can be 128, 192 or 256 bits.
|
||||
/// The default is 128 bits.
|
||||
int32_t data_key_length_bits = kDefaultDataKeyLengthBits;
|
||||
};
|
||||
|
||||
/// High-level decryption settings.
struct PARQUET_EXPORT DecryptionConfiguration {
  /// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client
  /// objects).
  /// The default is 600 (10 minutes).
  double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;
};
|
||||
|
||||
/// This is a core class, that translates the parameters of high level encryption (like
|
||||
/// the names of encrypted columns, names of master keys, etc), into parameters of low
|
||||
/// level encryption (like the key metadata, DEK, etc). A factory that produces the low
|
||||
/// level FileEncryptionProperties and FileDecryptionProperties objects, from the high
|
||||
/// level parameters.
|
||||
class PARQUET_EXPORT CryptoFactory {
|
||||
public:
|
||||
/// a KmsClientFactory object must be registered via this method before calling any of
|
||||
/// GetFileEncryptionProperties()/GetFileDecryptionProperties() methods.
|
||||
void RegisterKmsClientFactory(std::shared_ptr<KmsClientFactory> kms_client_factory);
|
||||
|
||||
/// Get the encryption properties for a Parquet file.
|
||||
/// If external key material is used then a file system and path to the
|
||||
/// parquet file must be provided.
|
||||
std::shared_ptr<FileEncryptionProperties> GetFileEncryptionProperties(
|
||||
const KmsConnectionConfig& kms_connection_config,
|
||||
const EncryptionConfiguration& encryption_config, const std::string& file_path = "",
|
||||
const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);
|
||||
|
||||
/// Get decryption properties for a Parquet file.
|
||||
/// If external key material is used then a file system and path to the
|
||||
/// parquet file must be provided.
|
||||
std::shared_ptr<FileDecryptionProperties> GetFileDecryptionProperties(
|
||||
const KmsConnectionConfig& kms_connection_config,
|
||||
const DecryptionConfiguration& decryption_config, const std::string& file_path = "",
|
||||
const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);
|
||||
|
||||
void RemoveCacheEntriesForToken(const std::string& access_token) {
|
||||
key_toolkit_->RemoveCacheEntriesForToken(access_token);
|
||||
}
|
||||
|
||||
void RemoveCacheEntriesForAllTokens() {
|
||||
key_toolkit_->RemoveCacheEntriesForAllTokens();
|
||||
}
|
||||
|
||||
/// Rotates master encryption keys for a Parquet file that uses external key material.
|
||||
/// In single wrapping mode, data encryption keys are decrypted with the old master keys
|
||||
/// and then re-encrypted with new master keys.
|
||||
/// In double wrapping mode, key encryption keys are decrypted with the old master keys
|
||||
/// and then re-encrypted with new master keys.
|
||||
/// This relies on the KMS supporting versioning, such that the old master key is
|
||||
/// used when unwrapping a key, and the latest version is used when wrapping a key.
|
||||
void RotateMasterKeys(const KmsConnectionConfig& kms_connection_config,
|
||||
const std::string& parquet_file_path,
|
||||
const std::shared_ptr<::arrow::fs::FileSystem>& file_system,
|
||||
bool double_wrapping = kDefaultDoubleWrapping,
|
||||
double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds);
|
||||
|
||||
private:
|
||||
ColumnPathToEncryptionPropertiesMap GetColumnEncryptionProperties(
|
||||
int dek_length, const std::string& column_keys, FileKeyWrapper* key_wrapper);
|
||||
|
||||
/// Key utilities object for kms client initialization and cache control
|
||||
std::shared_ptr<KeyToolkit> key_toolkit_ = std::make_shared<KeyToolkit>();
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,441 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/util/secure_string.h"
|
||||
#include "parquet/exception.h"
|
||||
#include "parquet/schema.h"
|
||||
#include "parquet/types.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
// Defaults and size constants for the low-level encryption properties.
// NOTE(review): the units of the two length constants are not stated here;
// confirm against their users.
static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
    ParquetCipher::AES_GCM_V1;
static constexpr int32_t kMaximalAadMetadataLength = 256;
static constexpr bool kDefaultEncryptedFooter = true;
static constexpr bool kDefaultCheckSignature = true;
static constexpr bool kDefaultAllowPlaintextFiles = false;
static constexpr int32_t kAadFileUniqueLength = 8;
|
||||
|
||||
// Ordered maps from a string key (a column path, going by the alias names) to
// the shared per-column decryption / encryption properties.
class ColumnDecryptionProperties;
using ColumnPathToDecryptionPropertiesMap =
    std::map<std::string, std::shared_ptr<ColumnDecryptionProperties>>;

class ColumnEncryptionProperties;
using ColumnPathToEncryptionPropertiesMap =
    std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>;
|
||||
|
||||
class PARQUET_EXPORT DecryptionKeyRetriever {
|
||||
public:
|
||||
/// \brief Retrieve a key.
|
||||
virtual ::arrow::util::SecureString GetKey(const std::string& key_id) = 0;
|
||||
|
||||
virtual ~DecryptionKeyRetriever() {}
|
||||
};
|
||||
|
||||
/// Simple integer key retriever
|
||||
class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever {
|
||||
public:
|
||||
void PutKey(uint32_t key_id, ::arrow::util::SecureString key);
|
||||
|
||||
::arrow::util::SecureString GetKey(const std::string& key_id_string) override {
|
||||
// key_id_string is string but for IntegerKeyIdRetriever it encodes
|
||||
// a native-endian 32 bit unsigned integer key_id
|
||||
uint32_t key_id;
|
||||
assert(key_id_string.size() == sizeof(key_id));
|
||||
memcpy(&key_id, key_id_string.data(), sizeof(key_id));
|
||||
|
||||
return key_map_.at(key_id);
|
||||
}
|
||||
|
||||
private:
|
||||
std::map<uint32_t, ::arrow::util::SecureString> key_map_;
|
||||
};
|
||||
|
||||
// Simple string key retriever
|
||||
class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever {
|
||||
public:
|
||||
void PutKey(std::string key_id, ::arrow::util::SecureString key);
|
||||
::arrow::util::SecureString GetKey(const std::string& key_id) override;
|
||||
|
||||
private:
|
||||
std::map<std::string, ::arrow::util::SecureString> key_map_;
|
||||
};
|
||||
|
||||
/// ParquetException subtype that is constructed from a column path; the path
/// is forwarded (as a C string) to the ParquetException constructor.
/// NOTE(review): presumably thrown when a column cannot be read because it is
/// hidden from the reader - confirm against the throw sites.
class PARQUET_EXPORT HiddenColumnException : public ParquetException {
 public:
  explicit HiddenColumnException(const std::string& columnPath)
      : ParquetException(columnPath.c_str()) {}
};
|
||||
|
||||
/// ParquetException subtype that is constructed from a column path; the path
/// is forwarded (as a C string) to the ParquetException constructor.
/// NOTE(review): presumably thrown when access to a column's key is denied -
/// confirm against the throw sites.
class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException {
 public:
  explicit KeyAccessDeniedException(const std::string& columnPath)
      : ParquetException(columnPath.c_str()) {}
};
|
||||
|
||||
inline const uint8_t* str2bytes(const std::string& str) {
|
||||
if (str.empty()) return NULLPTR;
|
||||
|
||||
char* cbytes = const_cast<char*>(str.c_str());
|
||||
return reinterpret_cast<const uint8_t*>(cbytes);
|
||||
}
|
||||
|
||||
/// \brief Expose the characters of a string as a span of const bytes.
///
/// \param str the string to view; no copy is made
/// \return a default-constructed span when str is empty, otherwise a span
///   over str's own storage covering str.size() bytes
inline ::arrow::util::span<const uint8_t> str2span(const std::string& str) {
  if (!str.empty()) {
    return {reinterpret_cast<const uint8_t*>(str.data()), str.size()};
  }

  return {};
}
|
||||
|
||||
class PARQUET_EXPORT ColumnEncryptionProperties {
|
||||
public:
|
||||
class PARQUET_EXPORT Builder {
|
||||
public:
|
||||
/// Convenience builder for encrypted columns.
|
||||
explicit Builder(std::string name) : Builder(std::move(name), true) {}
|
||||
|
||||
/// Convenience builder for encrypted columns.
|
||||
explicit Builder(const schema::ColumnPath& path)
|
||||
: Builder(path.ToDotString(), true) {}
|
||||
|
||||
/// Set a column-specific key.
|
||||
/// If key is not set on an encrypted column, the column will
|
||||
/// be encrypted with the footer key.
|
||||
/// keyBytes Key length must be either 16, 24 or 32 bytes.
|
||||
/// Caller is responsible for wiping out the input key array.
|
||||
Builder* key(::arrow::util::SecureString column_key);
|
||||
|
||||
/// Set a key retrieval metadata.
|
||||
/// use either key_metadata() or key_id(), not both
|
||||
Builder* key_metadata(std::string key_metadata);
|
||||
|
||||
/// A convenience function to set key metadata using a string id.
|
||||
/// Set a key retrieval metadata (converted from String).
|
||||
/// use either key_metadata() or key_id(), not both
|
||||
/// key_id will be converted to metadata (UTF-8 array).
|
||||
Builder* key_id(std::string key_id);
|
||||
|
||||
std::shared_ptr<ColumnEncryptionProperties> build() {
|
||||
return std::shared_ptr<ColumnEncryptionProperties>(
|
||||
new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_));
|
||||
}
|
||||
|
||||
private:
|
||||
std::string column_path_;
|
||||
bool encrypted_;
|
||||
::arrow::util::SecureString key_;
|
||||
std::string key_metadata_;
|
||||
|
||||
Builder(std::string path, bool encrypted)
|
||||
: column_path_(std::move(path)), encrypted_(encrypted) {}
|
||||
};
|
||||
|
||||
const std::string& column_path() const { return column_path_; }
|
||||
bool is_encrypted() const { return encrypted_; }
|
||||
bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; }
|
||||
const ::arrow::util::SecureString& key() const { return key_; }
|
||||
const std::string& key_metadata() const { return key_metadata_; }
|
||||
|
||||
private:
|
||||
std::string column_path_;
|
||||
bool encrypted_;
|
||||
bool encrypted_with_footer_key_;
|
||||
::arrow::util::SecureString key_;
|
||||
std::string key_metadata_;
|
||||
explicit ColumnEncryptionProperties(bool encrypted, std::string column_path,
|
||||
::arrow::util::SecureString key,
|
||||
std::string key_metadata);
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT ColumnDecryptionProperties {
|
||||
public:
|
||||
class PARQUET_EXPORT Builder {
|
||||
public:
|
||||
explicit Builder(std::string name) : column_path_(std::move(name)) {}
|
||||
|
||||
explicit Builder(const schema::ColumnPath& path) : Builder(path.ToDotString()) {}
|
||||
|
||||
/// Set an explicit column key. If applied on a file that contains
|
||||
/// key metadata for this column the metadata will be ignored,
|
||||
/// the column will be decrypted with this key.
|
||||
/// key length must be either 16, 24 or 32 bytes.
|
||||
Builder* key(::arrow::util::SecureString key);
|
||||
|
||||
std::shared_ptr<ColumnDecryptionProperties> build();
|
||||
|
||||
private:
|
||||
std::string column_path_;
|
||||
::arrow::util::SecureString key_;
|
||||
};
|
||||
|
||||
const std::string& column_path() const { return column_path_; }
|
||||
const ::arrow::util::SecureString& key() const { return key_; }
|
||||
|
||||
private:
|
||||
std::string column_path_;
|
||||
::arrow::util::SecureString key_;
|
||||
|
||||
/// This class is only required for setting explicit column decryption keys -
|
||||
/// to override key retriever (or to provide keys when key metadata and/or
|
||||
/// key retriever are not available)
|
||||
explicit ColumnDecryptionProperties(std::string column_path,
|
||||
::arrow::util::SecureString key);
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT AADPrefixVerifier {
|
||||
public:
|
||||
/// Verifies identity (AAD Prefix) of individual file,
|
||||
/// or of file collection in a data set.
|
||||
/// Throws exception if an AAD prefix is wrong.
|
||||
/// In a data set, AAD Prefixes should be collected,
|
||||
/// and then checked for missing files.
|
||||
virtual void Verify(const std::string& aad_prefix) = 0;
|
||||
virtual ~AADPrefixVerifier() {}
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT FileDecryptionProperties {
|
||||
public:
|
||||
class PARQUET_EXPORT Builder {
|
||||
public:
|
||||
Builder() {
|
||||
check_plaintext_footer_integrity_ = kDefaultCheckSignature;
|
||||
plaintext_files_allowed_ = kDefaultAllowPlaintextFiles;
|
||||
}
|
||||
|
||||
/// Set an explicit footer key. If applied on a file that contains
|
||||
/// footer key metadata the metadata will be ignored, the footer
|
||||
/// will be decrypted/verified with this key.
|
||||
/// If explicit key is not set, footer key will be fetched from
|
||||
/// key retriever.
|
||||
/// With explicit keys or AAD prefix, new encryption properties object must be
|
||||
/// created for each encrypted file.
|
||||
/// Explicit encryption keys (footer and column) are cloned.
|
||||
/// Upon completion of file reading, the cloned encryption keys in the properties
|
||||
/// will be wiped out (array values set to 0).
|
||||
/// Caller is responsible for wiping out the input key array.
|
||||
/// param footerKey Key length must be either 16, 24 or 32 bytes.
|
||||
Builder* footer_key(::arrow::util::SecureString footer_key);
|
||||
|
||||
/// Set explicit column keys (decryption properties).
|
||||
/// Its also possible to set a key retriever on this property object.
|
||||
/// Upon file decryption, availability of explicit keys is checked before
|
||||
/// invocation of the retriever callback.
|
||||
/// If an explicit key is available for a footer or a column,
|
||||
/// its key metadata will be ignored.
|
||||
Builder* column_keys(
|
||||
ColumnPathToDecryptionPropertiesMap column_decryption_properties);
|
||||
|
||||
/// Set a key retriever callback. Its also possible to
|
||||
/// set explicit footer or column keys on this file property object.
|
||||
/// Upon file decryption, availability of explicit keys is checked before
|
||||
/// invocation of the retriever callback.
|
||||
/// If an explicit key is available for a footer or a column,
|
||||
/// its key metadata will be ignored.
|
||||
Builder* key_retriever(std::shared_ptr<DecryptionKeyRetriever> key_retriever);
|
||||
|
||||
/// Skip integrity verification of plaintext footers.
|
||||
/// If not called, integrity of plaintext footers will be checked in runtime,
|
||||
/// and an exception will be thrown in the following situations:
|
||||
/// - footer signing key is not available
|
||||
/// (not passed, or not found by key retriever)
|
||||
/// - footer content and signature don't match
|
||||
Builder* disable_footer_signature_verification() {
|
||||
check_plaintext_footer_integrity_ = false;
|
||||
return this;
|
||||
}
|
||||
|
||||
/// Explicitly supply the file AAD prefix.
|
||||
/// A must when a prefix is used for file encryption, but not stored in file.
|
||||
/// If AAD prefix is stored in file, it will be compared to the explicitly
|
||||
/// supplied value and an exception will be thrown if they differ.
|
||||
Builder* aad_prefix(std::string aad_prefix);
|
||||
|
||||
/// Set callback for verification of AAD Prefixes stored in file.
|
||||
Builder* aad_prefix_verifier(std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier);
|
||||
|
||||
/// By default, reading plaintext (unencrypted) files is not
|
||||
/// allowed when using a decryptor
|
||||
/// - in order to detect files that were not encrypted by mistake.
|
||||
/// However, the default behavior can be overridden by calling this method.
|
||||
/// The caller should use then a different method to ensure encryption
|
||||
/// of files with sensitive data.
|
||||
Builder* plaintext_files_allowed() {
|
||||
plaintext_files_allowed_ = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
std::shared_ptr<FileDecryptionProperties> build() {
|
||||
return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
|
||||
footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_,
|
||||
aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_));
|
||||
}
|
||||
|
||||
private:
|
||||
::arrow::util::SecureString footer_key_;
|
||||
std::string aad_prefix_;
|
||||
std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
|
||||
ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
|
||||
|
||||
std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
|
||||
bool check_plaintext_footer_integrity_;
|
||||
bool plaintext_files_allowed_;
|
||||
};
|
||||
|
||||
const ::arrow::util::SecureString& column_key(const std::string& column_path) const;
|
||||
|
||||
const ::arrow::util::SecureString& footer_key() const { return footer_key_; }
|
||||
|
||||
const std::string& aad_prefix() const { return aad_prefix_; }
|
||||
|
||||
const std::shared_ptr<DecryptionKeyRetriever>& key_retriever() const {
|
||||
return key_retriever_;
|
||||
}
|
||||
|
||||
bool check_plaintext_footer_integrity() const {
|
||||
return check_plaintext_footer_integrity_;
|
||||
}
|
||||
|
||||
bool plaintext_files_allowed() const { return plaintext_files_allowed_; }
|
||||
|
||||
const std::shared_ptr<AADPrefixVerifier>& aad_prefix_verifier() const {
|
||||
return aad_prefix_verifier_;
|
||||
}
|
||||
|
||||
private:
|
||||
::arrow::util::SecureString footer_key_;
|
||||
std::string aad_prefix_;
|
||||
std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
|
||||
ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
|
||||
std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
|
||||
bool check_plaintext_footer_integrity_;
|
||||
bool plaintext_files_allowed_;
|
||||
|
||||
FileDecryptionProperties(
|
||||
::arrow::util::SecureString footer_key,
|
||||
std::shared_ptr<DecryptionKeyRetriever> key_retriever,
|
||||
bool check_plaintext_footer_integrity, std::string aad_prefix,
|
||||
std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
|
||||
ColumnPathToDecryptionPropertiesMap column_decryption_properties,
|
||||
bool plaintext_files_allowed);
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT FileEncryptionProperties {
|
||||
public:
|
||||
class PARQUET_EXPORT Builder {
|
||||
public:
|
||||
explicit Builder(::arrow::util::SecureString footer_key)
|
||||
: parquet_cipher_(kDefaultEncryptionAlgorithm),
|
||||
encrypted_footer_(kDefaultEncryptedFooter),
|
||||
footer_key_(std::move(footer_key)) {
|
||||
store_aad_prefix_in_file_ = false;
|
||||
}
|
||||
|
||||
/// Create files with plaintext footer.
|
||||
/// If not called, the files will be created with encrypted footer (default).
|
||||
Builder* set_plaintext_footer() {
|
||||
encrypted_footer_ = false;
|
||||
return this;
|
||||
}
|
||||
|
||||
/// Set encryption algorithm.
|
||||
/// If not called, files will be encrypted with AES_GCM_V1 (default).
|
||||
Builder* algorithm(ParquetCipher::type parquet_cipher) {
|
||||
parquet_cipher_ = parquet_cipher;
|
||||
return this;
|
||||
}
|
||||
|
||||
/// Set a key retrieval metadata (converted from String).
|
||||
/// use either footer_key_metadata or footer_key_id, not both.
|
||||
Builder* footer_key_id(std::string key_id);
|
||||
|
||||
/// Set a key retrieval metadata.
|
||||
/// use either footer_key_metadata or footer_key_id, not both.
|
||||
Builder* footer_key_metadata(std::string footer_key_metadata);
|
||||
|
||||
/// Set the file AAD Prefix.
|
||||
Builder* aad_prefix(std::string aad_prefix);
|
||||
|
||||
/// Skip storing AAD Prefix in file.
|
||||
/// If not called, and if AAD Prefix is set, it will be stored.
|
||||
Builder* disable_aad_prefix_storage();
|
||||
|
||||
/// Set the list of encrypted columns and their properties (keys etc).
|
||||
/// If not called, all columns will be encrypted with the footer key.
|
||||
/// If called, the file columns not in the list will be left unencrypted.
|
||||
Builder* encrypted_columns(ColumnPathToEncryptionPropertiesMap encrypted_columns);
|
||||
|
||||
std::shared_ptr<FileEncryptionProperties> build() {
|
||||
return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
|
||||
parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_,
|
||||
aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_));
|
||||
}
|
||||
|
||||
private:
|
||||
ParquetCipher::type parquet_cipher_;
|
||||
bool encrypted_footer_;
|
||||
::arrow::util::SecureString footer_key_;
|
||||
std::string footer_key_metadata_;
|
||||
|
||||
std::string aad_prefix_;
|
||||
bool store_aad_prefix_in_file_;
|
||||
ColumnPathToEncryptionPropertiesMap encrypted_columns_;
|
||||
};
|
||||
|
||||
bool encrypted_footer() const { return encrypted_footer_; }
|
||||
|
||||
EncryptionAlgorithm algorithm() const { return algorithm_; }
|
||||
|
||||
const ::arrow::util::SecureString& footer_key() const { return footer_key_; }
|
||||
|
||||
const std::string& footer_key_metadata() const { return footer_key_metadata_; }
|
||||
|
||||
const std::string& file_aad() const { return file_aad_; }
|
||||
|
||||
std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
|
||||
const std::string& column_path);
|
||||
|
||||
const ColumnPathToEncryptionPropertiesMap& encrypted_columns() const {
|
||||
return encrypted_columns_;
|
||||
}
|
||||
|
||||
private:
|
||||
EncryptionAlgorithm algorithm_;
|
||||
::arrow::util::SecureString footer_key_;
|
||||
std::string footer_key_metadata_;
|
||||
bool encrypted_footer_;
|
||||
std::string file_aad_;
|
||||
std::string aad_prefix_;
|
||||
bool store_aad_prefix_in_file_;
|
||||
ColumnPathToEncryptionPropertiesMap encrypted_columns_;
|
||||
|
||||
FileEncryptionProperties(ParquetCipher::type cipher,
|
||||
::arrow::util::SecureString footer_key,
|
||||
std::string footer_key_metadata, bool encrypted_footer,
|
||||
std::string aad_prefix, bool store_aad_prefix_in_file,
|
||||
ColumnPathToEncryptionPropertiesMap encrypted_columns);
|
||||
};
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,57 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "arrow/filesystem/filesystem.h"
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
/// Stores encryption key material outside the Parquet file, for example in a separate
|
||||
/// small file in the same folder. This is important for “key rotation”, when MEKs have to
|
||||
/// be changed (if compromised; or periodically, just in case) - without modifying the
|
||||
/// Parquet files (often immutable).
|
||||
class PARQUET_EXPORT FileKeyMaterialStore {
|
||||
public:
|
||||
/// Add key material for one encryption key.
|
||||
virtual void AddKeyMaterial(std::string key_id_in_file, std::string key_material) = 0;
|
||||
|
||||
/// Get key material
|
||||
virtual std::string GetKeyMaterial(std::string key_id_in_file) = 0;
|
||||
|
||||
/// After key material was added for all keys in the given Parquet file,
|
||||
/// save material in persistent store.
|
||||
virtual void SaveMaterial() = 0;
|
||||
|
||||
/// Remove key material from persistent store. Used in key rotation.
|
||||
virtual void RemoveMaterial() = 0;
|
||||
|
||||
/// Move key material to another store. Used in key rotation.
|
||||
virtual void MoveMaterialTo(std::shared_ptr<FileKeyMaterialStore> target_key_store) = 0;
|
||||
|
||||
/// Returns the Set of all key IDs in this store (for the given Parquet file)
|
||||
virtual std::vector<std::string> GetKeyIDSet() = 0;
|
||||
|
||||
virtual ~FileKeyMaterialStore() {}
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,96 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/util/concurrent_map.h"
|
||||
#include "arrow/util/secure_string.h"
|
||||
|
||||
#include "parquet/encryption/encryption.h"
|
||||
#include "parquet/encryption/file_system_key_material_store.h"
|
||||
#include "parquet/encryption/key_material.h"
|
||||
#include "parquet/encryption/key_toolkit.h"
|
||||
#include "parquet/encryption/key_toolkit_internal.h"
|
||||
#include "parquet/encryption/kms_client.h"
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
// This class will retrieve the key from "key metadata", following these steps:
|
||||
// 1. Parse "key metadata" (see structure in KeyMetadata class).
|
||||
// 2. Retrieve "key material" which can be stored inside or outside "key metadata".
|
||||
// 3. Unwrap the "data encryption key" from "key material". There are 2 modes:
|
||||
// 3.1. single wrapping: decrypt the wrapped "data encryption key" directly with "master
|
||||
// encryption key" 3.2. double wrapping: 2 steps: 3.2.1. "key encryption key" is decrypted
|
||||
// with "master encryption key" 3.2.2. "data encryption key" is decrypted with the above
|
||||
// "key encryption key"
|
||||
class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever {
|
||||
public:
|
||||
/// key_toolkit and kms_connection_config is to get KmsClient from cache or create
|
||||
/// KmsClient if it's not in the cache yet. cache_entry_lifetime_seconds is life time of
|
||||
/// KmsClient in the cache.
|
||||
/// If the file uses external key material then the Parquet file path and file
|
||||
/// system must be specified.
|
||||
FileKeyUnwrapper(std::shared_ptr<KeyToolkit> key_toolkit,
|
||||
const KmsConnectionConfig& kms_connection_config,
|
||||
double cache_lifetime_seconds, const std::string& file_path = "",
|
||||
const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);
|
||||
|
||||
/// Constructor overload that takes a raw pointer to the KeyToolkit
|
||||
FileKeyUnwrapper(KeyToolkit* key_toolkit,
|
||||
const KmsConnectionConfig& kms_connection_config,
|
||||
double cache_lifetime_seconds, const std::string& file_path = "",
|
||||
const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);
|
||||
|
||||
/// Constructor overload that takes a raw pointer to the KeyToolkit and
|
||||
/// accepts an existing key_material_store rather than using
|
||||
/// the file path and file system to create one when needed.
|
||||
FileKeyUnwrapper(KeyToolkit* key_toolkit,
|
||||
const KmsConnectionConfig& kms_connection_config,
|
||||
double cache_lifetime_seconds,
|
||||
std::shared_ptr<FileKeyMaterialStore> key_material_store);
|
||||
|
||||
/// Get the data key from key metadata
|
||||
::arrow::util::SecureString GetKey(const std::string& key_metadata_bytes) override;
|
||||
|
||||
/// Get the data key along with the master key id from key material
|
||||
KeyWithMasterId GetDataEncryptionKey(const KeyMaterial& key_material);
|
||||
|
||||
private:
|
||||
FileKeyUnwrapper(std::shared_ptr<KeyToolkit> key_toolkit_owner, KeyToolkit* key_toolkit,
|
||||
const KmsConnectionConfig& kms_connection_config,
|
||||
double cache_lifetime_seconds,
|
||||
std::shared_ptr<FileKeyMaterialStore> key_material_store,
|
||||
const std::string& file_path,
|
||||
const std::shared_ptr<::arrow::fs::FileSystem>& file_system);
|
||||
|
||||
std::shared_ptr<KmsClient> GetKmsClientFromConfigOrKeyMaterial(
|
||||
const KeyMaterial& key_material);
|
||||
|
||||
/// A map of Key Encryption Key (KEK) ID -> KEK bytes, for the current token
|
||||
std::shared_ptr<::arrow::util::ConcurrentMap<std::string, ::arrow::util::SecureString>>
|
||||
kek_per_kek_id_;
|
||||
std::shared_ptr<KeyToolkit> key_toolkit_owner_;
|
||||
KeyToolkit* key_toolkit_;
|
||||
KmsConnectionConfig kms_connection_config_;
|
||||
const double cache_entry_lifetime_seconds_;
|
||||
std::shared_ptr<FileKeyMaterialStore> key_material_store_;
|
||||
const std::string file_path_;
|
||||
std::shared_ptr<::arrow::fs::FileSystem> file_system_;
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,84 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "arrow/util/concurrent_map.h"
|
||||
|
||||
#include "parquet/encryption/file_key_material_store.h"
|
||||
#include "parquet/encryption/key_encryption_key.h"
|
||||
#include "parquet/encryption/key_toolkit.h"
|
||||
#include "parquet/encryption/kms_client.h"
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
// This class will generate "key metadata" from "data encryption key" and "master key",
|
||||
// following these steps:
|
||||
// 1. Wrap "data encryption key". There are 2 modes:
|
||||
// 1.1. single wrapping: encrypt "data encryption key" directly with "master encryption
|
||||
// key"
|
||||
// 1.2. double wrapping: 2 steps:
|
||||
// 1.2.1. "key encryption key" is randomized (see KeyEncryptionKey class)
|
||||
// 1.2.2. "data encryption key" is encrypted with the above "key encryption key"
|
||||
// 2. Create "key material" (see structure in KeyMaterial class)
|
||||
// 3. Create "key metadata" with "key material" inside or a reference to outside "key
|
||||
// material" (see structure in KeyMetadata class).
|
||||
class PARQUET_EXPORT FileKeyWrapper {
|
||||
public:
|
||||
static constexpr int kKeyEncryptionKeyLength = 16;
|
||||
static constexpr int kKeyEncryptionKeyIdLength = 16;
|
||||
|
||||
/// key_toolkit and kms_connection_config is to get KmsClient from the cache or create
|
||||
/// KmsClient if it's not in the cache yet. cache_entry_lifetime_seconds is life time of
|
||||
/// KmsClient in the cache. key_material_store is to store "key material" outside
|
||||
/// parquet file, NULL if "key material" is stored inside parquet file.
|
||||
FileKeyWrapper(KeyToolkit* key_toolkit,
|
||||
const KmsConnectionConfig& kms_connection_config,
|
||||
std::shared_ptr<FileKeyMaterialStore> key_material_store,
|
||||
double cache_entry_lifetime_seconds, bool double_wrapping);
|
||||
|
||||
/// Creates key_metadata field for a given data key, via wrapping the key with the
|
||||
/// master key.
|
||||
/// When external key material is used, an identifier is usually generated automatically
|
||||
/// but may be specified explicitly to support key rotation,
|
||||
/// which requires keeping the same identifiers.
|
||||
std::string GetEncryptionKeyMetadata(const ::arrow::util::SecureString& data_key,
|
||||
const std::string& master_key_id,
|
||||
bool is_footer_key,
|
||||
std::string key_id_in_file = "");
|
||||
|
||||
private:
|
||||
KeyEncryptionKey CreateKeyEncryptionKey(const std::string& master_key_id);
|
||||
|
||||
/// A map of Master Encryption Key ID -> KeyEncryptionKey, for the current token
|
||||
std::shared_ptr<::arrow::util::ConcurrentMap<std::string, KeyEncryptionKey>>
|
||||
kek_per_master_key_id_;
|
||||
|
||||
std::shared_ptr<KmsClient> kms_client_;
|
||||
KmsConnectionConfig kms_connection_config_;
|
||||
std::shared_ptr<FileKeyMaterialStore> key_material_store_;
|
||||
const double cache_entry_lifetime_seconds_;
|
||||
const bool double_wrapping_;
|
||||
uint16_t key_counter_;
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,89 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "arrow/filesystem/filesystem.h"
|
||||
|
||||
#include "parquet/encryption/file_key_material_store.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
/// A FileKeyMaterialStore that stores key material in a file system file in the same
|
||||
/// folder as the Parquet file.
|
||||
class PARQUET_EXPORT FileSystemKeyMaterialStore : public FileKeyMaterialStore {
|
||||
public:
|
||||
static constexpr const char kKeyMaterialFilePrefix[] = "_KEY_MATERIAL_FOR_";
|
||||
static constexpr const char kTempFilePrefix[] = "_TMP";
|
||||
static constexpr const char kKeyMaterialFileSuffix[] = ".json";
|
||||
|
||||
FileSystemKeyMaterialStore() {}
|
||||
FileSystemKeyMaterialStore(std::string key_material_file_path,
|
||||
std::shared_ptr<::arrow::fs::FileSystem> file_system);
|
||||
|
||||
/// Creates a new file system key material store for a parquet file.
|
||||
/// When use_tmp_prefix is true, files are saved with an extra _TMP prefix so they don't
|
||||
/// conflict with existing external material files. This is useful during key rotation
|
||||
/// so that temporary key material files can be created while using the existing key
|
||||
/// material, before moving the key material to the non-temporary location.
|
||||
static std::shared_ptr<FileSystemKeyMaterialStore> Make(
|
||||
std::string parquet_file_path, std::shared_ptr<::arrow::fs::FileSystem> file_system,
|
||||
bool use_tmp_prefix);
|
||||
|
||||
/// Add key material for one encryption key.
|
||||
void AddKeyMaterial(std::string key_id_in_file, std::string key_material) {
|
||||
key_material_map_.emplace(std::move(key_id_in_file), std::move(key_material));
|
||||
}
|
||||
|
||||
/// Get key material
|
||||
std::string GetKeyMaterial(std::string key_id_in_file) {
|
||||
if (key_material_map_.empty()) {
|
||||
LoadKeyMaterialMap();
|
||||
}
|
||||
auto found = key_material_map_.find(key_id_in_file);
|
||||
return found->second;
|
||||
}
|
||||
|
||||
/// After key material was added for all keys in the given Parquet file,
|
||||
/// save material in persistent store.
|
||||
void SaveMaterial();
|
||||
|
||||
/// Remove key material from persistent store. Used in key rotation.
|
||||
void RemoveMaterial();
|
||||
|
||||
/// Move key material to another store. Used in key rotation.
|
||||
void MoveMaterialTo(std::shared_ptr<FileKeyMaterialStore> target_key_store);
|
||||
|
||||
/// Returns the Set of all key IDs in this store (for the given Parquet file)
|
||||
std::vector<std::string> GetKeyIDSet();
|
||||
|
||||
private:
|
||||
/// Returns a copy of key_material_file_path_.
std::string GetStorageFilePath() {
  return key_material_file_path_;
}
|
||||
|
||||
std::string BuildKeyMaterialMapJson();
|
||||
void LoadKeyMaterialMap();
|
||||
std::string key_material_file_path_;
|
||||
std::shared_ptr<::arrow::fs::FileSystem> file_system_;
|
||||
/// Maps ID of a key in Parquet file and key material
|
||||
std::unordered_map<std::string, std::string> key_material_map_;
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,58 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/util/base64.h"
|
||||
#include "arrow/util/secure_string.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
// In the double wrapping mode, each "data encryption key" (DEK) is encrypted with a "key
// encryption key" (KEK), that in turn is encrypted with a "master encryption key" (MEK).
|
||||
// In a writer process, a random KEK is generated for each MEK ID, and cached in a <MEK-ID
|
||||
// : KEK> map. This allows to perform an interaction with a KMS server only once for each
|
||||
// MEK, in order to wrap its KEK. "Data encryption key" (DEK) wrapping is performed
|
||||
// locally, and does not involve an interaction with a KMS server.
|
||||
class KeyEncryptionKey {
|
||||
public:
|
||||
KeyEncryptionKey(::arrow::util::SecureString kek_bytes, std::string kek_id,
|
||||
std::string encoded_wrapped_kek)
|
||||
: kek_bytes_(std::move(kek_bytes)),
|
||||
kek_id_(std::move(kek_id)),
|
||||
encoded_kek_id_(::arrow::util::base64_encode(kek_id_)),
|
||||
encoded_wrapped_kek_(std::move(encoded_wrapped_kek)) {}
|
||||
|
||||
const ::arrow::util::SecureString& kek_bytes() const { return kek_bytes_; }
|
||||
|
||||
const std::string& kek_id() const { return kek_id_; }
|
||||
|
||||
const std::string& encoded_kek_id() const { return encoded_kek_id_; }
|
||||
|
||||
const std::string& encoded_wrapped_kek() const { return encoded_wrapped_kek_; }
|
||||
|
||||
private:
|
||||
::arrow::util::SecureString kek_bytes_;
|
||||
std::string kek_id_;
|
||||
std::string encoded_kek_id_;
|
||||
std::string encoded_wrapped_kek_;
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,129 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace json {
|
||||
namespace internal {
|
||||
class ObjectParser;
|
||||
} // namespace internal
|
||||
} // namespace json
|
||||
} // namespace arrow
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
// KeyMaterial class represents the "key material", keeping the information that allows
|
||||
// readers to recover an encryption key (see description of the KeyMetadata class). The
|
||||
// keytools package (PARQUET-1373) implements the "envelope encryption" pattern, in a
|
||||
// "single wrapping" or "double wrapping" mode. In the single wrapping mode, the key
|
||||
// material is generated by encrypting the "data encryption key" (DEK) by a "master key".
|
||||
// In the double wrapping mode, the key material is generated by encrypting the DEK by a
|
||||
// "key encryption key" (KEK), that in turn is encrypted by a "master key".
|
||||
//
|
||||
// Key material is kept in a flat json object, with the following fields:
|
||||
// 1. "keyMaterialType" - a String, with the type of key material. In the current
|
||||
// version, only one value is allowed - "PKMT1" (stands
|
||||
// for "parquet key management tools, version 1"). For external key material storage,
|
||||
// this field is written in both "key metadata" and "key material" jsons. For internal
|
||||
// key material storage, this field is written only once in the common json.
|
||||
// 2. "isFooterKey" - a boolean. If true, means that the material belongs to a file footer
|
||||
// key, and keeps additional information (such as
|
||||
// KMS instance ID and URL). If false, means that the material belongs to a column
|
||||
// key.
|
||||
// 3. "kmsInstanceID" - a String, with the KMS Instance ID. Written only in footer key
|
||||
// material.
|
||||
// 4. "kmsInstanceURL" - a String, with the KMS Instance URL. Written only in footer key
|
||||
// material.
|
||||
// 5. "masterKeyID" - a String, with the ID of the master key used to generate the
|
||||
// material.
|
||||
// 6. "wrappedDEK" - a String, with the wrapped DEK (base64 encoding).
|
||||
// 7. "doubleWrapping" - a boolean. If true, means that the material was generated in
|
||||
// double wrapping mode.
|
||||
// If false - in single wrapping mode.
|
||||
// 8. "keyEncryptionKeyID" - a String, with the ID of the KEK used to generate the
|
||||
// material. Written only in double wrapping mode.
|
||||
// 9. "wrappedKEK" - a String, with the wrapped KEK (base64 encoding). Written only in
|
||||
// double wrapping mode.
|
||||
class PARQUET_EXPORT KeyMaterial {
|
||||
public:
|
||||
// these fields are defined in a specification and should never be changed
|
||||
static constexpr const char kKeyMaterialTypeField[] = "keyMaterialType";
|
||||
static constexpr const char kKeyMaterialType1[] = "PKMT1";
|
||||
|
||||
static constexpr const char kFooterKeyIdInFile[] = "footerKey";
|
||||
static constexpr const char kColumnKeyIdInFilePrefix[] = "columnKey";
|
||||
|
||||
static constexpr const char kIsFooterKeyField[] = "isFooterKey";
|
||||
static constexpr const char kDoubleWrappingField[] = "doubleWrapping";
|
||||
static constexpr const char kKmsInstanceIdField[] = "kmsInstanceID";
|
||||
static constexpr const char kKmsInstanceUrlField[] = "kmsInstanceURL";
|
||||
static constexpr const char kMasterKeyIdField[] = "masterKeyID";
|
||||
static constexpr const char kWrappedDataEncryptionKeyField[] = "wrappedDEK";
|
||||
static constexpr const char kKeyEncryptionKeyIdField[] = "keyEncryptionKeyID";
|
||||
static constexpr const char kWrappedKeyEncryptionKeyField[] = "wrappedKEK";
|
||||
|
||||
public:
|
||||
KeyMaterial() = default;
|
||||
|
||||
static KeyMaterial Parse(const std::string& key_material_string);
|
||||
|
||||
static KeyMaterial Parse(
|
||||
const ::arrow::json::internal::ObjectParser* key_material_json);
|
||||
|
||||
/// This method returns a json string that will be stored either inside a parquet file
|
||||
/// or in a key material store outside the parquet file.
|
||||
static std::string SerializeToJson(bool is_footer_key,
|
||||
const std::string& kms_instance_id,
|
||||
const std::string& kms_instance_url,
|
||||
const std::string& master_key_id,
|
||||
bool is_double_wrapped, const std::string& kek_id,
|
||||
const std::string& encoded_wrapped_kek,
|
||||
const std::string& encoded_wrapped_dek,
|
||||
bool is_internal_storage);
|
||||
|
||||
bool is_footer_key() const { return is_footer_key_; }
|
||||
bool is_double_wrapped() const { return is_double_wrapped_; }
|
||||
const std::string& master_key_id() const { return master_key_id_; }
|
||||
const std::string& wrapped_dek() const { return encoded_wrapped_dek_; }
|
||||
const std::string& kek_id() const { return kek_id_; }
|
||||
const std::string& wrapped_kek() const { return encoded_wrapped_kek_; }
|
||||
const std::string& kms_instance_id() const { return kms_instance_id_; }
|
||||
const std::string& kms_instance_url() const { return kms_instance_url_; }
|
||||
|
||||
private:
|
||||
KeyMaterial(bool is_footer_key, const std::string& kms_instance_id,
|
||||
const std::string& kms_instance_url, const std::string& master_key_id,
|
||||
bool is_double_wrapped, const std::string& kek_id,
|
||||
const std::string& encoded_wrapped_kek,
|
||||
const std::string& encoded_wrapped_dek);
|
||||
|
||||
bool is_footer_key_;
|
||||
std::string kms_instance_id_;
|
||||
std::string kms_instance_url_;
|
||||
std::string master_key_id_;
|
||||
bool is_double_wrapped_;
|
||||
std::string kek_id_;
|
||||
std::string encoded_wrapped_kek_;
|
||||
std::string encoded_wrapped_dek_;
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,91 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <variant>
|
||||
|
||||
#include "parquet/encryption/key_material.h"
|
||||
#include "parquet/exception.h"
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
// Parquet encryption specification defines "key metadata" as an arbitrary byte array,
|
||||
// generated by file writers for each encryption key, and passed to the low level API for
|
||||
// storage in the file footer. The "key metadata" field is made available to file readers
|
||||
// to enable recovery of the key. This interface can be utilized for implementation
|
||||
// of any key management scheme.
|
||||
//
|
||||
// The keytools package (PARQUET-1373) implements one approach, of many possible, to key
|
||||
// management and to generation of the "key metadata" fields. This approach, based on the
|
||||
// "envelope encryption" pattern, allows integration with KMS servers. It keeps the actual
|
||||
// material, required to recover a key, in a "key material" object (see the KeyMaterial
|
||||
// class for details). This class is implemented to support version 1 of the parquet key
|
||||
// management tools specification.
|
||||
//
|
||||
// KeyMetadata writes (and reads) the "key metadata" field as a flat json object,
|
||||
// with the following fields:
|
||||
// 1. "keyMaterialType" - a String, with the type of key material.
|
||||
// 2. "internalStorage" - a boolean. If true, means that "key material" is kept inside the
|
||||
// "key metadata" field. If false, "key material" is kept externally (outside Parquet
|
||||
// files) - in this case, "key metadata" keeps a reference to the external "key material".
|
||||
// 3. "keyReference" - a String, with the reference to the external "key material".
|
||||
// Written only if internalStorage is false.
|
||||
//
|
||||
// If internalStorage is true, "key material" is a part of "key metadata", and the json
|
||||
// keeps additional fields, described in the KeyMaterial class.
|
||||
class PARQUET_EXPORT KeyMetadata {
|
||||
public:
|
||||
static constexpr const char kKeyMaterialInternalStorageField[] = "internalStorage";
|
||||
static constexpr const char kKeyReferenceField[] = "keyReference";
|
||||
|
||||
/// key_metadata_bytes is the key metadata field stored in the parquet file,
|
||||
/// in the serialized json object format.
|
||||
static KeyMetadata Parse(const std::string& key_metadata_bytes);
|
||||
|
||||
static std::string CreateSerializedForExternalMaterial(
|
||||
const std::string& key_reference);
|
||||
|
||||
bool key_material_stored_internally() const { return is_internal_storage_; }
|
||||
|
||||
const KeyMaterial& key_material() const {
|
||||
if (!is_internal_storage_) {
|
||||
throw ParquetException("key material is stored externally.");
|
||||
}
|
||||
return ::std::get<KeyMaterial>(key_material_or_reference_);
|
||||
}
|
||||
|
||||
const std::string& key_reference() const {
|
||||
if (is_internal_storage_) {
|
||||
throw ParquetException("key material is stored internally.");
|
||||
}
|
||||
return ::std::get<std::string>(key_material_or_reference_);
|
||||
}
|
||||
|
||||
private:
|
||||
explicit KeyMetadata(const KeyMaterial& key_material);
|
||||
explicit KeyMetadata(const std::string& key_reference);
|
||||
|
||||
bool is_internal_storage_;
|
||||
/// If is_internal_storage_ is true, KeyMaterial is set,
|
||||
/// else a string referencing to an outside "key material" is set.
|
||||
::std::variant<KeyMaterial, std::string> key_material_or_reference_;
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,106 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "parquet/encryption/key_encryption_key.h"
|
||||
#include "parquet/encryption/kms_client.h"
|
||||
#include "parquet/encryption/kms_client_factory.h"
|
||||
#include "parquet/encryption/two_level_cache_with_expiration.h"
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
// Period between cache clean-ups for key rotation. The value 60 * 60 is
// annotated as one hour, so the unit is presumably seconds — TODO confirm
// against the code that consumes it.
static constexpr uint64_t kCacheCleanPeriodForKeyRotation = 60 * 60; // 1 hour
|
||||
|
||||
// KeyToolkit is a utility that keeps various tools for key management (such as key
|
||||
// rotation, kms client instantiation, cache control, etc), plus a number of auxiliary
|
||||
// classes for internal use.
|
||||
class PARQUET_EXPORT KeyToolkit {
|
||||
public:
|
||||
KeyToolkit() { last_cache_clean_for_key_rotation_time_ = {}; }
|
||||
|
||||
/// KMS client two level cache: token -> KMSInstanceId -> KmsClient
|
||||
TwoLevelCacheWithExpiration<std::shared_ptr<KmsClient>>& kms_client_cache_per_token() {
|
||||
return kms_client_cache_;
|
||||
}
|
||||
/// Key encryption key two level cache for wrapping: token -> MasterEncryptionKeyId ->
|
||||
/// KeyEncryptionKey
|
||||
TwoLevelCacheWithExpiration<KeyEncryptionKey>& kek_write_cache_per_token() {
|
||||
return key_encryption_key_write_cache_;
|
||||
}
|
||||
|
||||
/// Key encryption key two level cache for unwrapping: token -> KeyEncryptionKeyId ->
|
||||
/// KeyEncryptionKeyBytes
|
||||
TwoLevelCacheWithExpiration<::arrow::util::SecureString>& kek_read_cache_per_token() {
|
||||
return key_encryption_key_read_cache_;
|
||||
}
|
||||
|
||||
std::shared_ptr<KmsClient> GetKmsClient(
|
||||
const KmsConnectionConfig& kms_connection_config, double cache_entry_lifetime_ms);
|
||||
|
||||
/// Flush any caches that are tied to the (compromised) access_token
|
||||
void RemoveCacheEntriesForToken(const std::string& access_token);
|
||||
|
||||
void RemoveCacheEntriesForAllTokens();
|
||||
|
||||
void RegisterKmsClientFactory(std::shared_ptr<KmsClientFactory> kms_client_factory) {
|
||||
if (kms_client_factory_ != NULLPTR) {
|
||||
throw ParquetException("KMS client factory has already been registered.");
|
||||
}
|
||||
kms_client_factory_ = std::move(kms_client_factory);
|
||||
}
|
||||
|
||||
/// Key rotation. In the single wrapping mode, decrypts data keys with old master keys,
|
||||
/// then encrypts them with new master keys. In the double wrapping mode, decrypts KEKs
|
||||
/// (key encryption keys) with old master keys, generates new KEKs and encrypts them
|
||||
/// with new master keys. Works only if key material is not stored internally in file
|
||||
/// footers. Not supported in local key wrapping mode. Method can be run by multiple
|
||||
/// threads, but each thread must work on different files.
|
||||
void RotateMasterKeys(const KmsConnectionConfig& kms_connection_config,
|
||||
const std::string& parquet_file_path,
|
||||
const std::shared_ptr<::arrow::fs::FileSystem>& file_system,
|
||||
bool double_wrapping, double cache_lifetime_seconds);
|
||||
|
||||
private:
|
||||
TwoLevelCacheWithExpiration<std::shared_ptr<KmsClient>> kms_client_cache_;
|
||||
TwoLevelCacheWithExpiration<KeyEncryptionKey> key_encryption_key_write_cache_;
|
||||
TwoLevelCacheWithExpiration<::arrow::util::SecureString> key_encryption_key_read_cache_;
|
||||
std::shared_ptr<KmsClientFactory> kms_client_factory_;
|
||||
mutable ::arrow::util::Mutex last_cache_clean_for_key_rotation_time_mutex_;
|
||||
internal::TimePoint last_cache_clean_for_key_rotation_time_;
|
||||
};
|
||||
|
||||
// "data encryption key" and "master key identifier" are paired together as output when
|
||||
// parsing from "key material"
|
||||
class PARQUET_EXPORT KeyWithMasterId {
|
||||
public:
|
||||
KeyWithMasterId(::arrow::util::SecureString key_bytes, std::string master_id)
|
||||
: key_bytes_(std::move(key_bytes)), master_id_(std::move(master_id)) {}
|
||||
|
||||
const ::arrow::util::SecureString& data_key() const { return key_bytes_; }
|
||||
const std::string& master_id() const { return master_id_; }
|
||||
|
||||
private:
|
||||
::arrow::util::SecureString key_bytes_;
|
||||
std::string master_id_;
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,97 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "arrow/util/mutex.h"
|
||||
#include "arrow/util/secure_string.h"
|
||||
|
||||
#include "parquet/exception.h"
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
/// This class wraps the key access token of a KMS server. If your token changes over
|
||||
/// time, you should keep the reference to the KeyAccessToken object and call Refresh()
|
||||
/// method every time you have a new token.
|
||||
class PARQUET_EXPORT KeyAccessToken {
|
||||
public:
|
||||
KeyAccessToken() = default;
|
||||
|
||||
explicit KeyAccessToken(const std::string value) : value_(value) {}
|
||||
|
||||
void Refresh(const std::string& new_value) {
|
||||
auto lock = mutex_.Lock();
|
||||
value_ = new_value;
|
||||
}
|
||||
|
||||
const std::string& value() const {
|
||||
auto lock = mutex_.Lock();
|
||||
return value_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::string value_;
|
||||
mutable ::arrow::util::Mutex mutex_;
|
||||
};
|
||||
|
||||
struct PARQUET_EXPORT KmsConnectionConfig {
|
||||
std::string kms_instance_id;
|
||||
std::string kms_instance_url;
|
||||
/// If the access token is changed in the future, you should keep a reference to
|
||||
/// this object and call Refresh() on it whenever there is a new access token.
|
||||
std::shared_ptr<KeyAccessToken> refreshable_key_access_token;
|
||||
std::unordered_map<std::string, std::string> custom_kms_conf;
|
||||
|
||||
KmsConnectionConfig();
|
||||
|
||||
const std::string& key_access_token() const {
|
||||
if (refreshable_key_access_token == NULLPTR ||
|
||||
refreshable_key_access_token->value().empty()) {
|
||||
throw ParquetException("key access token is not set!");
|
||||
}
|
||||
return refreshable_key_access_token->value();
|
||||
}
|
||||
|
||||
void SetDefaultIfEmpty();
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT KmsClient {
|
||||
public:
|
||||
static constexpr const char kKmsInstanceIdDefault[] = "DEFAULT";
|
||||
static constexpr const char kKmsInstanceUrlDefault[] = "DEFAULT";
|
||||
static constexpr const char kKeyAccessTokenDefault[] = "DEFAULT";
|
||||
|
||||
/// \brief Wraps a key.
|
||||
///
|
||||
/// Encrypts it with the master key, encodes the result
|
||||
/// and potentially adds a KMS-specific metadata.
|
||||
virtual std::string WrapKey(const ::arrow::util::SecureString& key_bytes,
|
||||
const std::string& master_key_identifier) = 0;
|
||||
|
||||
/// \brief Decrypts (unwraps) a key with the master key.
|
||||
virtual ::arrow::util::SecureString UnwrapKey(
|
||||
const std::string& wrapped_key, const std::string& master_key_identifier) = 0;
|
||||
|
||||
virtual ~KmsClient() {}
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,38 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "parquet/encryption/kms_client.h"
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
class PARQUET_EXPORT KmsClientFactory {
|
||||
public:
|
||||
explicit KmsClientFactory(bool wrap_locally = false) : wrap_locally_(wrap_locally) {}
|
||||
|
||||
virtual ~KmsClientFactory() = default;
|
||||
|
||||
virtual std::shared_ptr<KmsClient> CreateKmsClient(
|
||||
const KmsConnectionConfig& kms_connection_config) = 0;
|
||||
|
||||
protected:
|
||||
bool wrap_locally_;
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,95 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/util/concurrent_map.h"
|
||||
|
||||
#include "parquet/encryption/kms_client.h"
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
/// This class supports local wrapping mode, master keys will be fetched from the KMS
|
||||
/// server and used to encrypt other keys (data encryption keys or key encryption keys).
|
||||
class PARQUET_EXPORT LocalWrapKmsClient : public KmsClient {
 public:
  static constexpr const char kLocalWrapNoKeyVersion[] = "NO_VERSION";

  explicit LocalWrapKmsClient(const KmsConnectionConfig& kms_connection_config);

  /// Wraps key_bytes with the master key identified by master_key_identifier.
  std::string WrapKey(const ::arrow::util::SecureString& key_bytes,
                      const std::string& master_key_identifier) override;

  /// Unwraps wrapped_key with the master key identified by master_key_identifier.
  ::arrow::util::SecureString UnwrapKey(
      const std::string& wrapped_key, const std::string& master_key_identifier) override;

 protected:
  /// Get master key from the remote KMS server.
  /// Note: this function might be called by multiple threads
  virtual const ::arrow::util::SecureString& GetMasterKeyFromServer(
      const std::string& master_key_identifier) = 0;

 private:
  /// KMS systems wrap keys by encrypting them by master keys, and attaching additional
  /// information (such as the version number of the master key) to the result of
  /// encryption. The master key version is required in key rotation. Currently, the
  /// local wrapping mode does not support key rotation (because not all KMS systems allow
  /// to fetch a master key by its ID and version number). Still, the local wrapping mode
  /// adds a placeholder for the master key version, that will enable support for key
  /// rotation in this mode in the future, with appropriate KMS systems. This will also
  /// enable backward compatibility, where future readers will be able to extract master
  /// key version in the files written by the current code.
  ///
  /// LocalKeyWrap class writes (and reads) the "key wrap" as a flat json with the
  /// following fields:
  /// 1. "masterKeyVersion" - a String, with the master key version. In the current
  /// version, only one value is allowed - "NO_VERSION".
  /// 2. "encryptedKey" - a String, with the key encrypted by the master key
  /// (base64-encoded).
  class LocalKeyWrap {
   public:
    static constexpr const char kLocalWrapKeyVersionField[] = "masterKeyVersion";
    static constexpr const char kLocalWrapEncryptedKeyField[] = "encryptedKey";

    LocalKeyWrap(std::string master_key_version, std::string encrypted_encoded_key);

    /// Serializes a key wrap holding encrypted_encoded_key.
    static std::string CreateSerialized(const std::string& encrypted_encoded_key);

    /// Reconstructs a key wrap from its serialized form.
    static LocalKeyWrap Parse(const std::string& wrapped_key);

    const std::string& master_key_version() const {
      return master_key_version_;
    }

    const std::string& encrypted_encoded_key() const {
      return encrypted_encoded_key_;
    }

   private:
    // Note: declared in the opposite order to the constructor parameters.
    std::string encrypted_encoded_key_;
    std::string master_key_version_;
  };

  /// Returns the key for key_identifier; defined out of line.
  const ::arrow::util::SecureString& GetKeyFromServer(const std::string& key_identifier);

 protected:
  KmsConnectionConfig kms_connection_config_;
  // Maps a string key to key bytes; presumably keyed by master key identifier
  // — confirm in the implementation file.
  ::arrow::util::ConcurrentMap<std::string, ::arrow::util::SecureString>
      master_key_cache_;
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,135 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This module declares constants and helper functions shared by the Parquet
// encryption tests: sample footer/column encryption keys, test column names,
// master keys, and helpers that build key maps and column key mappings.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "arrow/filesystem/filesystem.h"
|
||||
#include "arrow/filesystem/localfs.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/io_util.h"
|
||||
#include "arrow/util/secure_string.h"
|
||||
|
||||
#include "parquet/encryption/encryption.h"
|
||||
#include "parquet/test_util.h"
|
||||
|
||||
namespace parquet {
|
||||
class ParquetFileReader;
|
||||
namespace encryption::test {
|
||||
|
||||
using ::arrow::internal::TemporaryDir;
|
||||
using ::arrow::util::SecureString;
|
||||
|
||||
constexpr int kFixedLength = 10;
|
||||
|
||||
const SecureString kFooterEncryptionKey("0123456789012345");
|
||||
const SecureString kColumnEncryptionKey1("1234567890123450");
|
||||
const SecureString kColumnEncryptionKey2("1234567890123451");
|
||||
const char kFileName[] = "tester";
|
||||
|
||||
// Get the path of file inside parquet test data directory
|
||||
std::string data_file(const char* file);
|
||||
|
||||
// A temporary directory that contains the encrypted files generated in the tests.
|
||||
extern std::unique_ptr<TemporaryDir> temp_dir;
|
||||
|
||||
inline ::arrow::Result<std::unique_ptr<TemporaryDir>> temp_data_dir() {
|
||||
return TemporaryDir::Make("parquet-encryption-test-");
|
||||
}
|
||||
|
||||
// Column names of the schema used by the encryption tests, one per physical type.
constexpr char kDoubleFieldName[] = "double_field";
constexpr char kFloatFieldName[] = "float_field";
constexpr char kBooleanFieldName[] = "boolean_field";
constexpr char kInt32FieldName[] = "int32_field";
constexpr char kInt64FieldName[] = "int64_field";
constexpr char kInt96FieldName[] = "int96_field";
constexpr char kByteArrayFieldName[] = "ba_field";
constexpr char kFixedLenByteArrayFieldName[] = "flba_field";

// Master keys and their identifiers: one footer key and six column keys.
// kColumnMasterKeys[i] is listed at the same position as kColumnMasterKeyIds[i].
constexpr char kFooterMasterKey[] = "0123456789012345";
constexpr char kFooterMasterKeyId[] = "kf";
constexpr const char* kColumnMasterKeys[] = {
    "1234567890123450", "1234567890123451", "1234567890123452",
    "1234567890123453", "1234567890123454", "1234567890123455"};
constexpr const char* kColumnMasterKeyIds[] = {"kc1", "kc2", "kc3",
                                               "kc4", "kc5", "kc6"};

// New master key values used to simulate key rotation
constexpr char kNewFooterMasterKey[] = "9123456789012345";
constexpr const char* kNewColumnMasterKeys[] = {
    "9234567890123450", "9234567890123451", "9234567890123452",
    "9234567890123453", "9234567890123454", "9234567890123455"};
|
||||
|
||||
// The result of this function will be used to set into TestOnlyInMemoryKmsClientFactory
|
||||
// as the key mapping to look at.
|
||||
std::unordered_map<std::string, SecureString> BuildKeyMap(const char* const* column_ids,
|
||||
const char* const* column_keys,
|
||||
const char* footer_id,
|
||||
const char* footer_key);
|
||||
|
||||
// The result of this function will be used to set into EncryptionConfiguration
|
||||
// as column keys.
|
||||
std::string BuildColumnKeyMapping();
|
||||
|
||||
// FileEncryptor and FileDecryptor are helper classes to write/read an encrypted parquet
|
||||
// file corresponding to each pair of FileEncryptionProperties/FileDecryptionProperties.
|
||||
// FileEncryptor writes the file with fixed data values and FileDecryptor reads the file
|
||||
// and verify the correctness of data values.
|
||||
class FileEncryptor {
|
||||
public:
|
||||
FileEncryptor();
|
||||
|
||||
void EncryptFile(
|
||||
std::string file,
|
||||
std::shared_ptr<parquet::FileEncryptionProperties> encryption_configurations);
|
||||
|
||||
private:
|
||||
std::shared_ptr<schema::GroupNode> SetupEncryptionSchema();
|
||||
|
||||
int num_rowgroups_ = 5;
|
||||
int rows_per_rowgroup_ = 50;
|
||||
std::shared_ptr<schema::GroupNode> schema_;
|
||||
};
|
||||
|
||||
class FileDecryptor {
|
||||
public:
|
||||
void DecryptFile(
|
||||
const std::string& file_name,
|
||||
const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties);
|
||||
void DecryptPageIndex(
|
||||
const std::string& file_name,
|
||||
const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties);
|
||||
|
||||
private:
|
||||
void CheckFile(
|
||||
parquet::ParquetFileReader* file_reader,
|
||||
const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties);
|
||||
void CheckPageIndex(
|
||||
parquet::ParquetFileReader* file_reader,
|
||||
const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties);
|
||||
};
|
||||
|
||||
} // namespace encryption::test
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,101 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <unordered_map>
|
||||
|
||||
#include "arrow/util/base64.h"
|
||||
|
||||
#include "parquet/encryption/kms_client_factory.h"
|
||||
#include "parquet/encryption/local_wrap_kms_client.h"
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
// This is a mock class, built for testing only. Don't use it as an example of
|
||||
// LocalWrapKmsClient implementation.
|
||||
class TestOnlyLocalWrapInMemoryKms : public LocalWrapKmsClient {
|
||||
public:
|
||||
explicit TestOnlyLocalWrapInMemoryKms(const KmsConnectionConfig& kms_connection_config);
|
||||
|
||||
static void InitializeMasterKeys(
|
||||
const std::unordered_map<std::string, ::arrow::util::SecureString>&
|
||||
master_keys_map);
|
||||
|
||||
protected:
|
||||
const ::arrow::util::SecureString& GetMasterKeyFromServer(
|
||||
const std::string& master_key_identifier) override;
|
||||
|
||||
private:
|
||||
static std::unordered_map<std::string, ::arrow::util::SecureString> master_key_map_;
|
||||
};
|
||||
|
||||
// This is a mock class, built for testing only. Don't use it as an example of KmsClient
|
||||
// implementation.
|
||||
class TestOnlyInServerWrapKms : public KmsClient {
|
||||
public:
|
||||
static void InitializeMasterKeys(
|
||||
const std::unordered_map<std::string, ::arrow::util::SecureString>&
|
||||
master_keys_map);
|
||||
|
||||
std::string WrapKey(const ::arrow::util::SecureString& key_bytes,
|
||||
const std::string& master_key_identifier) override;
|
||||
|
||||
::arrow::util::SecureString UnwrapKey(
|
||||
const std::string& wrapped_key, const std::string& master_key_identifier) override;
|
||||
|
||||
static void StartKeyRotation(
|
||||
const std::unordered_map<std::string, ::arrow::util::SecureString>&
|
||||
new_master_keys_map);
|
||||
static void FinishKeyRotation();
|
||||
|
||||
private:
|
||||
::arrow::util::SecureString GetMasterKeyFromServer(
|
||||
const std::string& master_key_identifier);
|
||||
|
||||
// Different wrapping and unwrapping key maps to imitate versioning
|
||||
// and support key rotation.
|
||||
static std::unordered_map<std::string, ::arrow::util::SecureString>
|
||||
unwrapping_master_key_map_;
|
||||
static std::unordered_map<std::string, ::arrow::util::SecureString>
|
||||
wrapping_master_key_map_;
|
||||
};
|
||||
|
||||
// This is a mock class, built for testing only. Don't use it as an example of
|
||||
// KmsClientFactory implementation.
|
||||
class TestOnlyInMemoryKmsClientFactory : public KmsClientFactory {
|
||||
public:
|
||||
TestOnlyInMemoryKmsClientFactory(
|
||||
bool wrap_locally,
|
||||
const std::unordered_map<std::string, ::arrow::util::SecureString>& master_keys_map)
|
||||
: KmsClientFactory(wrap_locally) {
|
||||
TestOnlyLocalWrapInMemoryKms::InitializeMasterKeys(master_keys_map);
|
||||
TestOnlyInServerWrapKms::InitializeMasterKeys(master_keys_map);
|
||||
}
|
||||
|
||||
std::shared_ptr<KmsClient> CreateKmsClient(
|
||||
const KmsConnectionConfig& kms_connection_config) {
|
||||
if (wrap_locally_) {
|
||||
return std::make_shared<TestOnlyLocalWrapInMemoryKms>(kms_connection_config);
|
||||
} else {
|
||||
return std::make_shared<TestOnlyInServerWrapKms>();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,149 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "arrow/util/concurrent_map.h"
|
||||
#include "arrow/util/mutex.h"
|
||||
|
||||
namespace parquet::encryption {
|
||||
|
||||
using ::arrow::util::ConcurrentMap;
|
||||
|
||||
namespace internal {
|
||||
|
||||
// system_clock time point whose duration is expressed in (fractional) seconds
// stored as a double.
using TimePoint =
    std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<double>>;

// Reads std::chrono::system_clock and converts the reading to TimePoint.
inline TimePoint CurrentTimePoint() {
  const auto now = std::chrono::system_clock::now();
  return TimePoint(now);
}
|
||||
|
||||
template <typename E>
|
||||
class ExpiringCacheEntry {
|
||||
public:
|
||||
ExpiringCacheEntry() = default;
|
||||
|
||||
ExpiringCacheEntry(E cached_item, double expiration_interval_seconds)
|
||||
: expiration_timestamp_(CurrentTimePoint() +
|
||||
std::chrono::duration<double>(expiration_interval_seconds)),
|
||||
cached_item_(std::move(cached_item)) {}
|
||||
|
||||
bool IsExpired() const {
|
||||
const auto now = CurrentTimePoint();
|
||||
return (now > expiration_timestamp_);
|
||||
}
|
||||
|
||||
E cached_item() { return cached_item_; }
|
||||
|
||||
private:
|
||||
const TimePoint expiration_timestamp_;
|
||||
E cached_item_;
|
||||
};
|
||||
|
||||
// This class is to avoid the below warning when compiling KeyToolkit class with VS2015
|
||||
// warning C4503: decorated name length exceeded, name was truncated
|
||||
template <typename V>
|
||||
class ExpiringCacheMapEntry {
|
||||
public:
|
||||
ExpiringCacheMapEntry() = default;
|
||||
|
||||
explicit ExpiringCacheMapEntry(
|
||||
std::shared_ptr<ConcurrentMap<std::string, V>> cached_item,
|
||||
double expiration_interval_seconds)
|
||||
: map_cache_(cached_item, expiration_interval_seconds) {}
|
||||
|
||||
bool IsExpired() { return map_cache_.IsExpired(); }
|
||||
|
||||
std::shared_ptr<ConcurrentMap<std::string, V>> cached_item() {
|
||||
return map_cache_.cached_item();
|
||||
}
|
||||
|
||||
private:
|
||||
// ConcurrentMap object may be accessed and modified at many places at the same time,
|
||||
// from multiple threads, or even removed from cache.
|
||||
ExpiringCacheEntry<std::shared_ptr<ConcurrentMap<std::string, V>>> map_cache_;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
// Two-level cache with expiration of internal caches according to token lifetime.
|
||||
// External cache is per token, internal is per string key.
|
||||
// Wrapper class around:
|
||||
// std::unordered_map<std::string,
|
||||
// internal::ExpiringCacheEntry<std::unordered_map<std::string, V>>>
|
||||
// This cache is safe to be shared between threads.
|
||||
template <typename V>
|
||||
class TwoLevelCacheWithExpiration {
|
||||
public:
|
||||
TwoLevelCacheWithExpiration() {
|
||||
last_cache_cleanup_timestamp_ = internal::CurrentTimePoint();
|
||||
}
|
||||
|
||||
std::shared_ptr<ConcurrentMap<std::string, V>> GetOrCreateInternalCache(
|
||||
const std::string& access_token, double cache_entry_lifetime_seconds) {
|
||||
auto lock = mutex_.Lock();
|
||||
|
||||
auto external_cache_entry = cache_.find(access_token);
|
||||
if (external_cache_entry == cache_.end() ||
|
||||
external_cache_entry->second.IsExpired()) {
|
||||
cache_.insert({access_token, internal::ExpiringCacheMapEntry<V>(
|
||||
std::make_shared<ConcurrentMap<std::string, V>>(),
|
||||
cache_entry_lifetime_seconds)});
|
||||
}
|
||||
|
||||
return cache_[access_token].cached_item();
|
||||
}
|
||||
|
||||
void CheckCacheForExpiredTokens(double cache_cleanup_period_seconds = 0.0) {
|
||||
auto lock = mutex_.Lock();
|
||||
|
||||
const auto now = internal::CurrentTimePoint();
|
||||
if (now > (last_cache_cleanup_timestamp_ +
|
||||
std::chrono::duration<double>(cache_cleanup_period_seconds))) {
|
||||
RemoveExpiredEntriesNoMutex();
|
||||
last_cache_cleanup_timestamp_ = now;
|
||||
}
|
||||
}
|
||||
|
||||
void Remove(const std::string& access_token) {
|
||||
auto lock = mutex_.Lock();
|
||||
cache_.erase(access_token);
|
||||
}
|
||||
|
||||
void Clear() {
|
||||
auto lock = mutex_.Lock();
|
||||
cache_.clear();
|
||||
}
|
||||
|
||||
private:
|
||||
void RemoveExpiredEntriesNoMutex() {
|
||||
for (auto it = cache_.begin(); it != cache_.end();) {
|
||||
if (it->second.IsExpired()) {
|
||||
it = cache_.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::unordered_map<std::string, internal::ExpiringCacheMapEntry<V>> cache_;
|
||||
internal::TimePoint last_cache_cleanup_timestamp_;
|
||||
::arrow::util::Mutex mutex_;
|
||||
};
|
||||
|
||||
} // namespace parquet::encryption
|
||||
@@ -0,0 +1,28 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace parquet {

// Forward declarations only; the classes themselves are not defined here.

// Decryption side.
class Decryptor;
class InternalFileDecryptor;

// Encryption side.
class Encryptor;
class InternalFileEncryptor;

}  // namespace parquet
|
||||
@@ -0,0 +1,158 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <exception>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/string_util.h"
|
||||
#include "parquet/platform.h"
|
||||
|
||||
// PARQUET-1085: fall back to UNUSED() when ARROW_UNUSED has not been defined yet.
#ifndef ARROW_UNUSED
#  define ARROW_UNUSED(x) UNUSED(x)
#endif
|
||||
|
||||
// Parquet exception to Arrow Status
//
// BEGIN_/END_PARQUET_CATCH_EXCEPTIONS bracket a region with try/catch: a
// ParquetStatusException makes the enclosing function return the Status it
// carries, any other ParquetException makes it return Status::IOError built from
// what(). The two macros must be used as a pair.

#define BEGIN_PARQUET_CATCH_EXCEPTIONS try {
#define END_PARQUET_CATCH_EXCEPTIONS                   \
  }                                                    \
  catch (const ::parquet::ParquetStatusException& e) { \
    return e.status();                                 \
  }                                                    \
  catch (const ::parquet::ParquetException& e) {       \
    return ::arrow::Status::IOError(e.what());         \
  }

// Evaluates `s` inside the try/catch bracket and discards its value.
// clang-format off
#define PARQUET_CATCH_NOT_OK(s)  \
  BEGIN_PARQUET_CATCH_EXCEPTIONS \
  (s);                           \
  END_PARQUET_CATCH_EXCEPTIONS
// clang-format on

// Returns the value of `s` from the enclosing function, evaluated inside the
// try/catch bracket.
#define PARQUET_CATCH_AND_RETURN(s) \
  BEGIN_PARQUET_CATCH_EXCEPTIONS    \
  return (s);                       \
  END_PARQUET_CATCH_EXCEPTIONS
|
||||
|
||||
// Arrow Status to Parquet exception

// Converts `s` with ::arrow::ToStatus and deliberately drops the result.
#define PARQUET_IGNORE_NOT_OK(s)               \
  do {                                         \
    ::arrow::Status _s = ::arrow::ToStatus(s); \
    ARROW_UNUSED(_s);                          \
  } while (false)

// Converts `s` with ::arrow::ToStatus and throws a ParquetStatusException that
// carries the status when it is not OK.
#define PARQUET_THROW_NOT_OK(s)                               \
  do {                                                        \
    ::arrow::Status _s = ::arrow::ToStatus(s);                \
    if (!_s.ok()) {                                           \
      throw ::parquet::ParquetStatusException(std::move(_s)); \
    }                                                         \
  } while (false)

// Implementation detail of PARQUET_ASSIGN_OR_THROW: evaluates `rexpr` into the
// temporary `status_name`, throws if its status is not OK, otherwise moves the
// value into `lhs`. Expands to several statements, so it cannot be used as the
// unbraced body of an if/for/while.
#define PARQUET_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
  auto status_name = (rexpr);                                 \
  PARQUET_THROW_NOT_OK(status_name.status());                 \
  lhs = std::move(status_name).ValueOrDie();

// Assigns the value of `rexpr` to `lhs`, or throws when `rexpr` holds an error.
// The temporary's name is made unique with __COUNTER__.
#define PARQUET_ASSIGN_OR_THROW(lhs, rexpr)                                              \
  PARQUET_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
                               lhs, rexpr);
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class ParquetException : public std::exception {
|
||||
public:
|
||||
PARQUET_NORETURN static void EofException(const std::string& msg = "") {
|
||||
static std::string prefix = "Unexpected end of stream";
|
||||
if (msg.empty()) {
|
||||
throw ParquetException(prefix);
|
||||
}
|
||||
throw ParquetException(prefix, ": ", msg);
|
||||
}
|
||||
|
||||
PARQUET_NORETURN static void NYI(const std::string& msg = "") {
|
||||
throw ParquetException("Not yet implemented: ", msg, ".");
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
explicit ParquetException(Args&&... args)
|
||||
: msg_(::arrow::internal::JoinToString(std::forward<Args>(args)...)) {}
|
||||
|
||||
explicit ParquetException(std::string msg) : msg_(std::move(msg)) {}
|
||||
|
||||
explicit ParquetException(const char* msg, const std::exception&) : msg_(msg) {}
|
||||
|
||||
ParquetException(const ParquetException&) = default;
|
||||
ParquetException& operator=(const ParquetException&) = default;
|
||||
ParquetException(ParquetException&&) = default;
|
||||
ParquetException& operator=(ParquetException&&) = default;
|
||||
|
||||
const char* what() const noexcept override { return msg_.c_str(); }
|
||||
|
||||
private:
|
||||
std::string msg_;
|
||||
};
|
||||
|
||||
// Support printing a ParquetException.
|
||||
// This is needed for clang-on-MSVC as there operator<< is not defined for
|
||||
// std::exception.
|
||||
PARQUET_EXPORT
|
||||
std::ostream& operator<<(std::ostream& os, const ParquetException& exception);
|
||||
|
||||
class ParquetStatusException : public ParquetException {
|
||||
public:
|
||||
explicit ParquetStatusException(::arrow::Status status)
|
||||
: ParquetException(status.ToString()), status_(std::move(status)) {}
|
||||
|
||||
const ::arrow::Status& status() const { return status_; }
|
||||
|
||||
private:
|
||||
::arrow::Status status_;
|
||||
};
|
||||
|
||||
// This class exists for the purpose of detecting an invalid or corrupted file.
|
||||
class ParquetInvalidOrCorruptedFileException : public ParquetStatusException {
|
||||
public:
|
||||
ParquetInvalidOrCorruptedFileException(const ParquetInvalidOrCorruptedFileException&) =
|
||||
default;
|
||||
|
||||
template <typename Arg,
|
||||
typename std::enable_if<
|
||||
!std::is_base_of<ParquetInvalidOrCorruptedFileException, Arg>::value,
|
||||
int>::type = 0,
|
||||
typename... Args>
|
||||
explicit ParquetInvalidOrCorruptedFileException(Arg arg, Args&&... args)
|
||||
: ParquetStatusException(::arrow::Status::Invalid(std::forward<Arg>(arg),
|
||||
std::forward<Args>(args)...)) {}
|
||||
};
|
||||
|
||||
template <typename StatusReturnBlock>
|
||||
void ThrowNotOk(StatusReturnBlock&& b) {
|
||||
PARQUET_THROW_NOT_OK(b());
|
||||
}
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,257 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/io/caching.h"
|
||||
#include "arrow/util/type_fwd.h"
|
||||
#include "parquet/metadata.h" // IWYU pragma: keep
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/properties.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class ColumnReader;
|
||||
class FileMetaData;
|
||||
class PageIndexReader;
|
||||
class BloomFilterReader;
|
||||
class PageReader;
|
||||
class RowGroupMetaData;
|
||||
|
||||
namespace internal {
|
||||
class RecordReader;
|
||||
}
|
||||
|
||||
class PARQUET_EXPORT RowGroupReader {
|
||||
public:
|
||||
// Forward declare a virtual class 'Contents' to aid dependency injection and more
|
||||
// easily create test fixtures
|
||||
// An implementation of the Contents class is defined in the .cc file
|
||||
struct Contents {
|
||||
virtual ~Contents() {}
|
||||
virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
|
||||
virtual const RowGroupMetaData* metadata() const = 0;
|
||||
virtual const ReaderProperties* properties() const = 0;
|
||||
};
|
||||
|
||||
explicit RowGroupReader(std::unique_ptr<Contents> contents);
|
||||
|
||||
// Returns the rowgroup metadata
|
||||
const RowGroupMetaData* metadata() const;
|
||||
|
||||
// Construct a ColumnReader for the indicated row group-relative
|
||||
// column. Ownership is shared with the RowGroupReader.
|
||||
std::shared_ptr<ColumnReader> Column(int i);
|
||||
|
||||
// EXPERIMENTAL: Construct a RecordReader for the indicated column of the row group.
|
||||
// Ownership is shared with the RowGroupReader.
|
||||
std::shared_ptr<internal::RecordReader> RecordReader(int i,
|
||||
bool read_dictionary = false);
|
||||
|
||||
// Construct a ColumnReader, trying to enable exposed encoding.
|
||||
//
|
||||
// For dictionary encoding, currently we only support column chunks that are fully
|
||||
// dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded.
|
||||
// If a column chunk uses dictionary encoding but then falls back to plain encoding, the
|
||||
// encoding will not be exposed.
|
||||
//
|
||||
// The returned column reader provides an API GetExposedEncoding() for the
|
||||
// users to check the exposed encoding and determine how to read the batches.
|
||||
//
|
||||
// \note API EXPERIMENTAL
|
||||
std::shared_ptr<ColumnReader> ColumnWithExposeEncoding(
|
||||
int i, ExposedEncoding encoding_to_expose);
|
||||
|
||||
// Construct a RecordReader, trying to enable exposed encoding.
|
||||
//
|
||||
// For dictionary encoding, currently we only support column chunks that are
|
||||
// fully dictionary encoded byte arrays. The caller should verify if the reader can read
|
||||
// and expose the dictionary by checking the reader's read_dictionary(). If a column
|
||||
// chunk uses dictionary encoding but then falls back to plain encoding, the returned
|
||||
// reader will read decoded data without exposing the dictionary.
|
||||
//
|
||||
// \note API EXPERIMENTAL
|
||||
std::shared_ptr<internal::RecordReader> RecordReaderWithExposeEncoding(
|
||||
int i, ExposedEncoding encoding_to_expose);
|
||||
|
||||
std::unique_ptr<PageReader> GetColumnPageReader(int i);
|
||||
|
||||
private:
|
||||
// Holds a pointer to an instance of Contents implementation
|
||||
std::unique_ptr<Contents> contents_;
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT ParquetFileReader {
|
||||
public:
|
||||
// Declare a virtual class 'Contents' to aid dependency injection and more
|
||||
// easily create test fixtures
|
||||
// An implementation of the Contents class is defined in the .cc file
|
||||
struct PARQUET_EXPORT Contents {
|
||||
static std::unique_ptr<Contents> Open(
|
||||
std::shared_ptr<::arrow::io::RandomAccessFile> source,
|
||||
const ReaderProperties& props = default_reader_properties(),
|
||||
std::shared_ptr<FileMetaData> metadata = NULLPTR);
|
||||
|
||||
static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync(
|
||||
std::shared_ptr<::arrow::io::RandomAccessFile> source,
|
||||
const ReaderProperties& props = default_reader_properties(),
|
||||
std::shared_ptr<FileMetaData> metadata = NULLPTR);
|
||||
|
||||
virtual ~Contents() = default;
|
||||
// Perform any cleanup associated with the file contents
|
||||
virtual void Close() = 0;
|
||||
virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
|
||||
virtual std::shared_ptr<FileMetaData> metadata() const = 0;
|
||||
virtual std::shared_ptr<PageIndexReader> GetPageIndexReader() = 0;
|
||||
virtual BloomFilterReader& GetBloomFilterReader() = 0;
|
||||
};
|
||||
|
||||
ParquetFileReader();
|
||||
~ParquetFileReader();
|
||||
|
||||
// Create a file reader instance from an Arrow file object. Thread-safety is
|
||||
// the responsibility of the file implementation
|
||||
static std::unique_ptr<ParquetFileReader> Open(
|
||||
std::shared_ptr<::arrow::io::RandomAccessFile> source,
|
||||
const ReaderProperties& props = default_reader_properties(),
|
||||
std::shared_ptr<FileMetaData> metadata = NULLPTR);
|
||||
|
||||
// API Convenience to open a serialized Parquet file on disk, using Arrow IO
|
||||
// interfaces.
|
||||
static std::unique_ptr<ParquetFileReader> OpenFile(
|
||||
const std::string& path, bool memory_map = false,
|
||||
const ReaderProperties& props = default_reader_properties(),
|
||||
std::shared_ptr<FileMetaData> metadata = NULLPTR);
|
||||
|
||||
// Asynchronously open a file reader from an Arrow file object.
|
||||
// Does not throw - all errors are reported through the Future.
|
||||
static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync(
|
||||
std::shared_ptr<::arrow::io::RandomAccessFile> source,
|
||||
const ReaderProperties& props = default_reader_properties(),
|
||||
std::shared_ptr<FileMetaData> metadata = NULLPTR);
|
||||
|
||||
void Open(std::unique_ptr<Contents> contents);
|
||||
void Close();
|
||||
|
||||
// The RowGroupReader is owned by the FileReader
|
||||
std::shared_ptr<RowGroupReader> RowGroup(int i);
|
||||
|
||||
// Returns the file metadata. Only one instance is ever created
|
||||
std::shared_ptr<FileMetaData> metadata() const;
|
||||
|
||||
/// Returns the PageIndexReader. Only one instance is ever created.
|
||||
///
|
||||
/// If the file does not have the page index, nullptr may be returned.
|
||||
/// Because it pays to check existence of page index in the file, it
|
||||
/// is possible to return a non null value even if page index does
|
||||
/// not exist. It is the caller's responsibility to check the return
|
||||
/// value and follow-up calls to PageIndexReader.
|
||||
///
|
||||
/// WARNING: The returned PageIndexReader must not outlive the ParquetFileReader.
|
||||
/// Initialize GetPageIndexReader() is not thread-safety.
|
||||
std::shared_ptr<PageIndexReader> GetPageIndexReader();
|
||||
|
||||
/// Returns the BloomFilterReader. Only one instance is ever created.
|
||||
///
|
||||
/// WARNING: The returned BloomFilterReader must not outlive the ParquetFileReader.
|
||||
/// Initialize GetBloomFilterReader() is not thread-safety.
|
||||
BloomFilterReader& GetBloomFilterReader();
|
||||
|
||||
/// Pre-buffer the specified column indices in all row groups.
|
||||
///
|
||||
/// Readers can optionally call this to cache the necessary slices
|
||||
/// of the file in-memory before deserialization. Arrow readers can
|
||||
/// automatically do this via an option. This is intended to
|
||||
/// increase performance when reading from high-latency filesystems
|
||||
/// (e.g. Amazon S3).
|
||||
///
|
||||
/// After calling this, creating readers for row groups/column
|
||||
/// indices that were not buffered may fail. Creating multiple
|
||||
/// readers for the a subset of the buffered regions is
|
||||
/// acceptable. This may be called again to buffer a different set
|
||||
/// of row groups/columns.
|
||||
///
|
||||
/// If memory usage is a concern, note that data will remain
|
||||
/// buffered in memory until either \a PreBuffer() is called again,
|
||||
/// or the reader itself is destructed. Reading - and buffering -
|
||||
/// only one row group at a time may be useful.
|
||||
///
|
||||
/// This method may throw.
|
||||
void PreBuffer(const std::vector<int>& row_groups,
|
||||
const std::vector<int>& column_indices,
|
||||
const ::arrow::io::IOContext& ctx,
|
||||
const ::arrow::io::CacheOptions& options);
|
||||
|
||||
/// Retrieve the list of byte ranges that would need to be read to retrieve
|
||||
/// the data for the specified row groups and column indices.
|
||||
///
|
||||
/// A reader can optionally call this if they wish to handle their own
|
||||
/// caching and management of file reads (or offload them to other readers).
|
||||
/// Unlike PreBuffer, this method will not perform any actual caching or
|
||||
/// reads, instead just using the file metadata to determine the byte ranges
|
||||
/// that would need to be read if you were to consume the entirety of the column
|
||||
/// chunks for the provided columns in the specified row groups.
|
||||
///
|
||||
/// If row_groups or column_indices are empty, then the result of this will be empty.
|
||||
///
|
||||
/// hole_size_limit represents the maximum distance, in bytes, between two
|
||||
/// consecutive ranges; beyond this value, ranges will not be combined. The default
|
||||
/// value is 1MB.
|
||||
///
|
||||
/// range_size_limit is the maximum size in bytes of a combined range; if combining
|
||||
/// two consecutive ranges would produce a range larger than this, they are not
|
||||
/// combined. The default values is 64MB. This *must* be larger than hole_size_limit.
|
||||
///
|
||||
/// This will not take into account page indexes or any other predicate push down
|
||||
/// benefits that may be available.
|
||||
::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
|
||||
const std::vector<int>& row_groups, const std::vector<int>& column_indices,
|
||||
int64_t hole_size_limit = 1024 * 1024, int64_t range_size_limit = 64 * 1024 * 1024);
|
||||
|
||||
/// Wait for the specified row groups and column indices to be pre-buffered.
|
||||
///
|
||||
/// After the returned Future completes, reading the specified row
|
||||
/// groups/columns will not block.
|
||||
///
|
||||
/// PreBuffer must be called first. This method does not throw.
|
||||
::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
|
||||
const std::vector<int>& column_indices) const;
|
||||
|
||||
private:
|
||||
// Holds a pointer to an instance of Contents implementation
|
||||
std::unique_ptr<Contents> contents_;
|
||||
};
|
||||
|
||||
// Read only Parquet file metadata
|
||||
std::shared_ptr<FileMetaData> PARQUET_EXPORT
|
||||
ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
|
||||
|
||||
/// \brief Scan all values in file. Useful for performance testing
|
||||
/// \param[in] columns the column numbers to scan. If empty scans all
|
||||
/// \param[in] column_batch_size number of values to read at a time when scanning column
|
||||
/// \param[in] reader a ParquetFileReader instance
|
||||
/// \return number of semantic rows in file
|
||||
PARQUET_EXPORT
|
||||
int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
|
||||
ParquetFileReader* reader);
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,245 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "parquet/metadata.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/properties.h"
|
||||
#include "parquet/schema.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class ColumnWriter;
|
||||
|
||||
// FIXME: copied from reader-internal.cc
|
||||
static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
|
||||
static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
|
||||
|
||||
class PARQUET_EXPORT RowGroupWriter {
|
||||
public:
|
||||
// Forward declare a virtual class 'Contents' to aid dependency injection and more
|
||||
// easily create test fixtures
|
||||
// An implementation of the Contents class is defined in the .cc file
|
||||
struct Contents {
|
||||
virtual ~Contents() = default;
|
||||
virtual int num_columns() const = 0;
|
||||
virtual int64_t num_rows() const = 0;
|
||||
|
||||
// to be used only with ParquetFileWriter::AppendRowGroup
|
||||
virtual ColumnWriter* NextColumn() = 0;
|
||||
// to be used only with ParquetFileWriter::AppendBufferedRowGroup
|
||||
virtual ColumnWriter* column(int i) = 0;
|
||||
|
||||
virtual int current_column() const = 0;
|
||||
virtual void Close() = 0;
|
||||
|
||||
/// \brief total uncompressed bytes written by the page writer
|
||||
virtual int64_t total_bytes_written() const = 0;
|
||||
/// \brief total bytes still compressed but not written by the page writer
|
||||
virtual int64_t total_compressed_bytes() const = 0;
|
||||
/// \brief total compressed bytes written by the page writer
|
||||
virtual int64_t total_compressed_bytes_written() const = 0;
|
||||
|
||||
virtual bool buffered() const = 0;
|
||||
};
|
||||
|
||||
explicit RowGroupWriter(std::unique_ptr<Contents> contents);
|
||||
|
||||
/// Construct a ColumnWriter for the indicated row group-relative column.
|
||||
///
|
||||
/// To be used only with ParquetFileWriter::AppendRowGroup
|
||||
/// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
|
||||
/// valid until the next call to NextColumn or Close. As the contents are
|
||||
/// directly written to the sink, once a new column is started, the contents
|
||||
/// of the previous one cannot be modified anymore.
|
||||
ColumnWriter* NextColumn();
|
||||
/// Index of currently written column. Equal to -1 if NextColumn()
|
||||
/// has not been called yet.
|
||||
int current_column();
|
||||
void Close();
|
||||
|
||||
int num_columns() const;
|
||||
|
||||
/// Construct a ColumnWriter for the indicated row group column.
|
||||
///
|
||||
/// To be used only with ParquetFileWriter::AppendBufferedRowGroup
|
||||
/// Ownership is solely within the RowGroupWriter. The ColumnWriter is
|
||||
/// valid until Close. The contents are buffered in memory and written to sink
|
||||
/// on Close
|
||||
ColumnWriter* column(int i);
|
||||
|
||||
/**
|
||||
* Number of rows that shall be written as part of this RowGroup.
|
||||
*/
|
||||
int64_t num_rows() const;
|
||||
|
||||
/// \brief total uncompressed bytes written by the page writer
|
||||
int64_t total_bytes_written() const;
|
||||
/// \brief total bytes still compressed but not written by the page writer.
|
||||
/// It will always return 0 from the SerializedPageWriter.
|
||||
int64_t total_compressed_bytes() const;
|
||||
/// \brief total compressed bytes written by the page writer
|
||||
int64_t total_compressed_bytes_written() const;
|
||||
|
||||
/// Returns whether the current RowGroupWriter is in the buffered mode and is created
|
||||
/// by calling ParquetFileWriter::AppendBufferedRowGroup.
|
||||
bool buffered() const;
|
||||
|
||||
private:
|
||||
// Holds a pointer to an instance of Contents implementation
|
||||
std::unique_ptr<Contents> contents_;
|
||||
};
|
||||
|
||||
PARQUET_EXPORT
|
||||
void WriteFileMetaData(const FileMetaData& file_metadata,
|
||||
::arrow::io::OutputStream* sink);
|
||||
|
||||
PARQUET_EXPORT
|
||||
void WriteMetaDataFile(const FileMetaData& file_metadata,
|
||||
::arrow::io::OutputStream* sink);
|
||||
|
||||
PARQUET_EXPORT
|
||||
void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
|
||||
ArrowOutputStream* sink,
|
||||
const std::shared_ptr<Encryptor>& encryptor,
|
||||
bool encrypt_footer);
|
||||
|
||||
PARQUET_EXPORT
|
||||
void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
|
||||
::arrow::io::OutputStream* sink,
|
||||
const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
|
||||
bool encrypt_footer = false);
|
||||
PARQUET_EXPORT
|
||||
void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
|
||||
::arrow::io::OutputStream* sink);
|
||||
|
||||
class PARQUET_EXPORT ParquetFileWriter {
|
||||
public:
|
||||
// Forward declare a virtual class 'Contents' to aid dependency injection and more
|
||||
// easily create test fixtures
|
||||
// An implementation of the Contents class is defined in the .cc file
|
||||
struct Contents {
|
||||
Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
|
||||
std::shared_ptr<const KeyValueMetadata> key_value_metadata)
|
||||
: schema_(), key_value_metadata_(std::move(key_value_metadata)) {
|
||||
schema_.Init(std::move(schema));
|
||||
}
|
||||
virtual ~Contents() {}
|
||||
// Perform any cleanup associated with the file contents
|
||||
virtual void Close() = 0;
|
||||
|
||||
virtual RowGroupWriter* AppendRowGroup() = 0;
|
||||
virtual RowGroupWriter* AppendBufferedRowGroup() = 0;
|
||||
|
||||
virtual int64_t num_rows() const = 0;
|
||||
virtual int num_columns() const = 0;
|
||||
virtual int num_row_groups() const = 0;
|
||||
|
||||
virtual const std::shared_ptr<WriterProperties>& properties() const = 0;
|
||||
|
||||
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
|
||||
return key_value_metadata_;
|
||||
}
|
||||
|
||||
virtual void AddKeyValueMetadata(
|
||||
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata) = 0;
|
||||
|
||||
// Return const-pointer to make it clear that this object is not to be copied
|
||||
const SchemaDescriptor* schema() const { return &schema_; }
|
||||
|
||||
SchemaDescriptor schema_;
|
||||
|
||||
/// This should be the only place this is stored. Everything else is a const reference
|
||||
std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
|
||||
|
||||
const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }
|
||||
std::shared_ptr<FileMetaData> file_metadata_;
|
||||
};
|
||||
|
||||
ParquetFileWriter();
|
||||
~ParquetFileWriter();
|
||||
|
||||
static std::unique_ptr<ParquetFileWriter> Open(
|
||||
std::shared_ptr<::arrow::io::OutputStream> sink,
|
||||
std::shared_ptr<schema::GroupNode> schema,
|
||||
std::shared_ptr<WriterProperties> properties = default_writer_properties(),
|
||||
std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
|
||||
|
||||
void Open(std::unique_ptr<Contents> contents);
|
||||
void Close();
|
||||
|
||||
/// Construct a RowGroupWriter with an arbitrary number of rows.
|
||||
///
|
||||
/// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
|
||||
/// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
|
||||
RowGroupWriter* AppendRowGroup();
|
||||
|
||||
/// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
|
||||
/// Use this if you want to write a RowGroup based on a certain size
|
||||
///
|
||||
/// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
|
||||
/// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
|
||||
RowGroupWriter* AppendBufferedRowGroup();
|
||||
|
||||
/// \brief Add key-value metadata to the file.
|
||||
/// \param[in] key_value_metadata the metadata to add.
|
||||
/// \note This will overwrite any existing metadata with the same key(s).
|
||||
/// \throw ParquetException if Close() has been called.
|
||||
void AddKeyValueMetadata(
|
||||
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata);
|
||||
|
||||
/// Number of columns.
|
||||
///
|
||||
/// This number is fixed during the lifetime of the writer as it is determined via
|
||||
/// the schema.
|
||||
int num_columns() const;
|
||||
|
||||
/// Number of rows in the RowGroups started so far.
|
||||
///
|
||||
/// Changes on the addition of a new RowGroup.
|
||||
int64_t num_rows() const;
|
||||
|
||||
/// Number of started RowGroups.
|
||||
int num_row_groups() const;
|
||||
|
||||
/// Configuration passed to the writer, e.g. the used Parquet format version.
|
||||
const std::shared_ptr<WriterProperties>& properties() const;
|
||||
|
||||
/// Returns the file schema descriptor
|
||||
const SchemaDescriptor* schema() const;
|
||||
|
||||
/// Returns a column descriptor in schema
|
||||
const ColumnDescriptor* descr(int i) const;
|
||||
|
||||
/// Returns the file custom metadata
|
||||
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
|
||||
|
||||
/// Returns the file metadata, only available after calling Close().
|
||||
const std::shared_ptr<FileMetaData> metadata() const;
|
||||
|
||||
private:
|
||||
// Holds a pointer to an instance of Contents implementation
|
||||
std::unique_ptr<Contents> contents_;
|
||||
std::shared_ptr<FileMetaData> file_metadata_;
|
||||
};
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,198 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/types.h"
|
||||
|
||||
namespace parquet::geospatial {
|
||||
|
||||
/// \brief The maximum number of dimensions represented by a geospatial type
|
||||
/// (i.e., X, Y, Z, and M)
|
||||
inline constexpr int kMaxDimensions = 4;
|
||||
|
||||
/// \brief NaN, used to represent bounds for which predicate pushdown cannot
|
||||
/// be applied (e.g., because a writer did not provide bounds for a given dimension)
|
||||
inline constexpr double kNaN = std::numeric_limits<double>::quiet_NaN();
|
||||
|
||||
/// \brief Structure representing encoded statistics to be written to and read from Parquet
|
||||
/// serialized metadata.
|
||||
///
|
||||
/// See the Parquet Thrift definition and GeoStatistics for the specific definition
|
||||
/// of field values.
|
||||
struct PARQUET_EXPORT EncodedGeoStatistics {
|
||||
bool xy_bounds_present{false};
|
||||
double xmin{kNaN};
|
||||
double xmax{kNaN};
|
||||
double ymin{kNaN};
|
||||
double ymax{kNaN};
|
||||
|
||||
bool z_bounds_present{false};
|
||||
double zmin{kNaN};
|
||||
double zmax{kNaN};
|
||||
|
||||
bool m_bounds_present{false};
|
||||
double mmin{kNaN};
|
||||
double mmax{kNaN};
|
||||
|
||||
bool geospatial_types_present() const { return !geospatial_types.empty(); }
|
||||
std::vector<int32_t> geospatial_types;
|
||||
};
|
||||
|
||||
class GeoStatisticsImpl;
|
||||
|
||||
/// \brief Base type for computing geospatial column statistics while writing a file
|
||||
/// or representing them when reading a file
|
||||
///
|
||||
/// These statistics track the minimum and maximum value (omitting NaN values) of the
|
||||
/// four possible dimensions (X, Y, Z, and M) and the distinct set of geometry
|
||||
/// type/dimension combinations (e.g., point XY, linestring XYZM) present in the data.
|
||||
/// Any of these individual components may be "invalid": for example, when reading a
|
||||
/// Parquet file, information about individual components obtained from the column
|
||||
/// chunk metadata may have been missing or deemed unusable. Orthogonally,
|
||||
/// any of these individual components may be "empty": for example, when using
|
||||
/// GeoStatistics to accumulate bounds whilst writing, if all geometries in a column chunk
|
||||
/// are null, all ranges (X, Y, Z, and M) will be empty. If all geometries in a column
|
||||
/// chunk contain only XY coordinates (the most common case), the Z and M ranges will
|
||||
/// be empty but the X and Y ranges will contain finite bounds. Empty ranges are
|
||||
/// considered "valid" because they are known to represent exactly zero values (in
|
||||
/// contrast to an invalid range, whose contents are completely unknown). These concepts
|
||||
/// are all necessary for this object to accurately represent (1) accumulated or partially
|
||||
/// accumulated statistics during the writing process and (2) deserialized statistics read
|
||||
/// from the column chunk metadata during the reading process.
|
||||
///
|
||||
/// EXPERIMENTAL
|
||||
class PARQUET_EXPORT GeoStatistics {
|
||||
public:
|
||||
GeoStatistics();
|
||||
explicit GeoStatistics(const EncodedGeoStatistics& encoded);
|
||||
|
||||
~GeoStatistics();
|
||||
|
||||
/// \brief Return true if bounds, geometry types, and validity are identical
|
||||
bool Equals(const GeoStatistics& other) const;
|
||||
|
||||
/// \brief Update these statistics based on previously calculated or decoded statistics
|
||||
///
|
||||
/// Merging statistics with wraparound X values is not currently supported. Merging
|
||||
/// two GeoStatistics where one or both has a wraparound X range will result in these
|
||||
/// statistics having an X dimension marked as invalid.
|
||||
void Merge(const GeoStatistics& other);
|
||||
|
||||
/// \brief Update these statistics based on values
|
||||
void Update(const ByteArray* values, int64_t num_values);
|
||||
|
||||
/// \brief Update these statistics based on the non-null elements of values
|
||||
void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits,
|
||||
int64_t valid_bits_offset, int64_t num_spaced_values,
|
||||
int64_t num_values);
|
||||
|
||||
/// \brief Update these statistics based on the non-null elements of values
|
||||
///
|
||||
/// Currently, BinaryArray and LargeBinaryArray input is supported.
|
||||
void Update(const ::arrow::Array& values);
|
||||
|
||||
/// \brief Return these statistics to an empty state
|
||||
void Reset();
|
||||
|
||||
/// \brief Encode the statistics for serializing to Thrift
|
||||
///
|
||||
/// If invalid WKB was encountered or if the statistics contain NaN
|
||||
/// for any reason, Encode() will return nullopt to indicate that
|
||||
/// statistics should not be written to thrift.
|
||||
std::optional<EncodedGeoStatistics> Encode() const;
|
||||
|
||||
/// \brief Returns false if invalid WKB was encountered
|
||||
bool is_valid() const;
|
||||
|
||||
/// \brief Reset existing statistics and populate them from previously-encoded ones
|
||||
void Decode(const EncodedGeoStatistics& encoded);
|
||||
|
||||
/// \brief Minimum values in XYZM order
|
||||
///
|
||||
/// For dimensions where dimension_valid() is false, the value will be NaN. For
|
||||
/// dimensions where dimension_empty() is true, the value will be +Inf.
|
||||
///
|
||||
/// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this
|
||||
/// case, these bounds represent the union of the intervals [xmin, Inf] and [-Inf,
|
||||
/// xmax]. This implementation does not yet generate these types of bounds but they may
|
||||
/// be encountered in statistics when reading a Parquet file.
|
||||
std::array<double, kMaxDimensions> lower_bound() const;
|
||||
|
||||
/// \brief Maximum values in XYZM order
|
||||
///
|
||||
/// For dimensions where dimension_valid() is false, the value will be NaN. For
|
||||
/// dimensions where dimension_empty() is true, the value will be -Inf.
|
||||
///
|
||||
/// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this
|
||||
/// case, these bounds represent the union of the intervals [xmin, Inf] and [-Inf,
|
||||
/// xmax]. This implementation does not yet generate these types of bounds but they may
|
||||
/// be encountered in statistics when reading a Parquet file.
|
||||
std::array<double, kMaxDimensions> upper_bound() const;
|
||||
|
||||
/// \brief Dimension emptiness in XYZM order
|
||||
///
|
||||
/// True for a given dimension if and only if zero non-NaN values were encountered
|
||||
/// in that dimension and dimension_valid() is true for that dimension.
|
||||
///
|
||||
/// When calculating statistics, zero or more of these values may be true because
|
||||
/// this implementation calculates bounds for all dimensions; however, it may be
|
||||
/// true that zero coordinates were encountered in a given dimension. For example,
|
||||
/// dimension_empty() will return four true values if Update() was not called
|
||||
/// or if Update() was called with only null values. If Update() was provided
|
||||
/// one or more geometries with X and Y dimensions but not Z or M dimensions,
|
||||
/// dimension_empty() will return false, false, true, true.
|
||||
///
|
||||
/// For statistics read from a Parquet file, dimension_empty() will always contain
|
||||
/// false values because there is no mechanism to communicate an empty interval
|
||||
/// in the Thrift metadata.
|
||||
std::array<bool, kMaxDimensions> dimension_empty() const;
|
||||
|
||||
/// \brief Dimension validity (i.e. presence) in XYZM order
|
||||
///
|
||||
/// When calculating statistics, this will always be true because this implementation
|
||||
/// calculates statistics for all dimensions. When reading a Parquet file, one or more
|
||||
/// of these values may be false because the file may not have provided bounds for all
|
||||
/// dimensions.
|
||||
///
|
||||
/// See documentation for dimension_empty(), lower_bound(), and/or upper_bound() for the
|
||||
/// canonical values of those outputs for the dimensions where dimension_valid() is
|
||||
/// false.
|
||||
std::array<bool, kMaxDimensions> dimension_valid() const;
|
||||
|
||||
/// \brief Return the geometry type codes
|
||||
///
|
||||
/// This implementation always returns sorted output with no duplicates. When
|
||||
/// calculating statistics, a value will always be returned (although the returned
|
||||
/// vector may be empty if Update() was never called or was only called with null
|
||||
/// values). When reading a Parquet file, std::nullopt may be returned because
|
||||
/// the file may not have provided this information.
|
||||
std::optional<std::vector<int32_t>> geometry_types() const;
|
||||
|
||||
/// \brief Return a string representation of these statistics
|
||||
std::string ToString() const;
|
||||
|
||||
private:
|
||||
std::unique_ptr<GeoStatisticsImpl> impl_;
|
||||
};
|
||||
|
||||
} // namespace parquet::geospatial
|
||||
@@ -0,0 +1,131 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include "parquet/types.h"
|
||||
|
||||
namespace parquet {
|
||||
// Abstract class for hash
|
||||
class Hasher {
|
||||
public:
|
||||
/// Compute hash for 32 bits value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value to hash.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(int32_t value) const = 0;
|
||||
|
||||
/// Compute hash for 64 bits value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value to hash.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(int64_t value) const = 0;
|
||||
|
||||
/// Compute hash for float value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value to hash.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(float value) const = 0;
|
||||
|
||||
/// Compute hash for double value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value to hash.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(double value) const = 0;
|
||||
|
||||
/// Compute hash for Int96 value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value to hash.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(const Int96* value) const = 0;
|
||||
|
||||
/// Compute hash for ByteArray value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value to hash.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(const ByteArray* value) const = 0;
|
||||
|
||||
/// Compute hash for fixed byte array value by using its plain encoding result.
|
||||
///
|
||||
/// @param value the value address.
|
||||
/// @param len the value length.
|
||||
/// @return hash result.
|
||||
virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
|
||||
|
||||
/// Batch compute hashes for 32 bits values by using its plain encoding result.
|
||||
///
|
||||
/// @param values a pointer to the values to hash.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const = 0;
|
||||
|
||||
/// Batch compute hashes for 64 bits values by using its plain encoding result.
|
||||
///
|
||||
/// @param values a pointer to the values to hash.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const = 0;
|
||||
|
||||
/// Batch compute hashes for float values by using its plain encoding result.
|
||||
///
|
||||
/// @param values a pointer to the values to hash.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const float* values, int num_values, uint64_t* hashes) const = 0;
|
||||
|
||||
/// Batch compute hashes for double values by using its plain encoding result.
|
||||
///
|
||||
/// @param values a pointer to the values to hash.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const double* values, int num_values, uint64_t* hashes) const = 0;
|
||||
|
||||
/// Batch compute hashes for Int96 values by using its plain encoding result.
|
||||
///
|
||||
/// @param values a pointer to the values to hash.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const Int96* values, int num_values, uint64_t* hashes) const = 0;
|
||||
|
||||
/// Batch compute hashes for ByteArray values by using its plain encoding result.
|
||||
///
|
||||
/// @param values a pointer to the values to hash.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const ByteArray* values, int num_values,
|
||||
uint64_t* hashes) const = 0;
|
||||
|
||||
/// Batch compute hashes for fixed byte array values by using its plain encoding result.
|
||||
///
|
||||
/// @param values the value address.
|
||||
/// @param type_len the value length.
|
||||
/// @param num_values the number of values to hash.
|
||||
/// @param hashes a pointer to the output hash values, its length should be equal to
|
||||
/// num_values.
|
||||
virtual void Hashes(const FLBA* values, uint32_t type_len, int num_values,
|
||||
uint64_t* hashes) const = 0;
|
||||
|
||||
virtual ~Hasher() = default;
|
||||
};
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,38 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace parquet::internal {
|
||||
|
||||
/// Builds a bitmap where each set bit indicates the corresponding level is greater
|
||||
/// than rhs.
|
||||
uint64_t PARQUET_EXPORT GreaterThanBitmap(const int16_t* levels, int64_t num_levels,
|
||||
int16_t rhs);
|
||||
|
||||
struct MinMax {
|
||||
int16_t min;
|
||||
int16_t max;
|
||||
};
|
||||
|
||||
MinMax FindMinMax(const int16_t* levels, int64_t num_levels);
|
||||
|
||||
} // namespace parquet::internal
|
||||
@@ -0,0 +1,61 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
#pragma once
|
||||
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/endian.h"
|
||||
#include "parquet/level_comparison.h"
|
||||
|
||||
// Used to make sure ODR rule isn't violated.
|
||||
#ifndef PARQUET_IMPL_NAMESPACE
|
||||
# error "PARQUET_IMPL_NAMESPACE must be defined"
|
||||
#endif
|
||||
namespace parquet::internal::PARQUET_IMPL_NAMESPACE {
|
||||
/// Builds a bitmap by applying predicate to the level vector provided.
|
||||
///
|
||||
/// \param[in] levels Rep or def level array.
|
||||
/// \param[in] num_levels The number of levels to process (must be [0, 64])
|
||||
/// \param[in] predicate The predicate to apply (must have the signature `bool
|
||||
/// predicate(int16_t)`).
|
||||
/// \returns The bitmap using least significant "bit" ordering.
|
||||
///
|
||||
template <typename Predicate>
|
||||
inline uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels,
|
||||
Predicate predicate) {
|
||||
// Both clang and GCC can vectorize this automatically with SSE4/AVX2.
|
||||
uint64_t mask = 0;
|
||||
for (int x = 0; x < num_levels; x++) {
|
||||
mask |= static_cast<uint64_t>(predicate(levels[x]) ? 1 : 0) << x;
|
||||
}
|
||||
return ::arrow::bit_util::ToLittleEndian(mask);
|
||||
}
|
||||
|
||||
/// Scans a level array and returns the smallest and largest value found.
///
/// \param[in] levels Rep or def level array.
/// \param[in] num_levels The number of levels to scan.
/// \returns The min/max pair. When num_levels <= 0 nothing is scanned and the
/// result keeps its seed values (min == INT16_MAX, max == INT16_MIN).
inline MinMax FindMinMaxImpl(const int16_t* levels, int64_t num_levels) {
  // Seed with an inverted range so the first scanned level replaces both ends.
  MinMax out{std::numeric_limits<int16_t>::max(), std::numeric_limits<int16_t>::min()};
  // Index with int64_t to match num_levels: an `int` counter would overflow
  // (undefined behavior) for inputs longer than INT_MAX.
  for (int64_t x = 0; x < num_levels; ++x) {
    out.min = std::min(levels[x], out.min);
    out.max = std::max(levels[x], out.max);
  }
  return out;
}
|
||||
|
||||
inline uint64_t GreaterThanBitmapImpl(const int16_t* levels, int64_t num_levels,
|
||||
int16_t rhs) {
|
||||
return LevelsToBitmap(levels, num_levels, [rhs](int16_t value) { return value > rhs; });
|
||||
}
|
||||
|
||||
} // namespace parquet::internal::PARQUET_IMPL_NAMESPACE
|
||||
@@ -0,0 +1,216 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "arrow/util/endian.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/schema.h"
|
||||
|
||||
namespace parquet::internal {
|
||||
|
||||
struct PARQUET_EXPORT LevelInfo {
|
||||
LevelInfo()
|
||||
: null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {}
|
||||
LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level,
|
||||
int32_t repeated_ancestor_definition_level)
|
||||
: null_slot_usage(null_slots),
|
||||
def_level(static_cast<int16_t>(definition_level)),
|
||||
rep_level(static_cast<int16_t>(repetition_level)),
|
||||
repeated_ancestor_def_level(
|
||||
static_cast<int16_t>(repeated_ancestor_definition_level)) {}
|
||||
|
||||
bool operator==(const LevelInfo& b) const {
|
||||
return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
|
||||
rep_level == b.rep_level &&
|
||||
repeated_ancestor_def_level == b.repeated_ancestor_def_level;
|
||||
}
|
||||
|
||||
bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; }
|
||||
|
||||
// How many slots an undefined but present (i.e. null) element in
|
||||
// parquet consumes when decoding to Arrow.
|
||||
// "Slot" is used in the same context as the Arrow specification
|
||||
// (i.e. a value holder).
|
||||
// This is only ever >1 for descendents of FixedSizeList.
|
||||
int32_t null_slot_usage = 1;
|
||||
|
||||
// The definition level at which the value for the field
|
||||
// is considered not null (definition levels greater than
|
||||
// or equal to this value indicate a not-null
|
||||
// value for the field). For list fields definition levels
|
||||
// greater than or equal to this field indicate a present,
|
||||
// possibly null, child value.
|
||||
int16_t def_level = 0;
|
||||
|
||||
// The repetition level corresponding to this element
|
||||
// or the closest repeated ancestor. Any repetition
|
||||
// level less than this indicates either a new list OR
|
||||
// an empty list (which is determined in conjunction
|
||||
// with definition levels).
|
||||
int16_t rep_level = 0;
|
||||
|
||||
// The definition level indicating the level at which the closest
|
||||
// repeated ancestor is not empty. This is used to discriminate
|
||||
// between a value less than |def_level| being null or excluded entirely.
|
||||
// For instance if we have an arrow schema like:
|
||||
// list(struct(f0: int)). Then then there are the following
|
||||
// definition levels:
|
||||
// 0 = null list
|
||||
// 1 = present but empty list.
|
||||
// 2 = a null value in the list
|
||||
// 3 = a non null struct but null integer.
|
||||
// 4 = a present integer.
|
||||
// When reconstructing, the struct and integer arrays'
|
||||
// repeated_ancestor_def_level would be 2. Any
|
||||
// def_level < 2 indicates that there isn't a corresponding
|
||||
// child value in the list.
|
||||
// i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
|
||||
// has the def levels [0, 1, 2, 3, 4]. The actual
|
||||
// struct array is only of length 3: [not-set, set, set] and
|
||||
// the int array is also of length 3: [N/A, null, 1].
|
||||
//
|
||||
int16_t repeated_ancestor_def_level = 0;
|
||||
|
||||
/// Increments levels according to the cardinality of node.
|
||||
void Increment(const schema::Node& node) {
|
||||
if (node.is_repeated()) {
|
||||
IncrementRepeated();
|
||||
return;
|
||||
}
|
||||
if (node.is_optional()) {
|
||||
IncrementOptional();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/// Increments level for a optional node.
|
||||
void IncrementOptional() { def_level++; }
|
||||
|
||||
/// Increments levels for the repeated node. Returns
|
||||
/// the previous ancestor_list_def_level.
|
||||
int16_t IncrementRepeated() {
|
||||
int16_t last_repeated_ancestor = repeated_ancestor_def_level;
|
||||
|
||||
// Repeated fields add both a repetition and definition level. This is used
|
||||
// to distinguish between an empty list and a list with an item in it.
|
||||
++rep_level;
|
||||
++def_level;
|
||||
// For levels >= repeated_ancestor_def_level it indicates the list was
|
||||
// non-null and had at least one element. This is important
|
||||
// for later decoding because we need to add a slot for these
|
||||
// values. for levels < current_def_level no slots are added
|
||||
// to arrays.
|
||||
repeated_ancestor_def_level = def_level;
|
||||
return last_repeated_ancestor;
|
||||
}
|
||||
|
||||
// Calculates and returns LevelInfo for a column descriptor.
|
||||
static LevelInfo ComputeLevelInfo(const ColumnDescriptor* descr) {
|
||||
LevelInfo level_info;
|
||||
level_info.def_level = descr->max_definition_level();
|
||||
level_info.rep_level = descr->max_repetition_level();
|
||||
|
||||
int16_t min_spaced_def_level = descr->max_definition_level();
|
||||
const ::parquet::schema::Node* node = descr->schema_node().get();
|
||||
while (node && !node->is_repeated()) {
|
||||
if (node->is_optional()) {
|
||||
min_spaced_def_level--;
|
||||
}
|
||||
node = node->parent();
|
||||
}
|
||||
level_info.repeated_ancestor_def_level = min_spaced_def_level;
|
||||
return level_info;
|
||||
}
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) {
|
||||
// This print method is to silence valgrind issues. What's printed
|
||||
// is not important because all asserts happen directly on
|
||||
// members.
|
||||
os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
|
||||
<< ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level;
|
||||
if (levels.null_slot_usage > 1) {
|
||||
os << ", null_slot_usage=" << levels.null_slot_usage;
|
||||
}
|
||||
os << "}";
|
||||
return os;
|
||||
}
|
||||
};
|
||||
|
||||
// Input/Output structure for reconstructed validity bitmaps.
|
||||
struct PARQUET_EXPORT ValidityBitmapInputOutput {
|
||||
// Input only.
|
||||
// The maximum number of values_read expected (actual
|
||||
// values read must be less than or equal to this value).
|
||||
// If this number is exceeded methods will throw a
|
||||
// ParquetException. Exceeding this limit indicates
|
||||
// either a corrupt or incorrectly written file.
|
||||
int64_t values_read_upper_bound = 0;
|
||||
// Output only. The number of values added to the encountered
|
||||
// (this is logically the count of the number of elements
|
||||
// for an Arrow array).
|
||||
int64_t values_read = 0;
|
||||
// Input/Output. The number of nulls encountered.
|
||||
int64_t null_count = 0;
|
||||
// Output only. The validity bitmap to populate. Maybe be null only
|
||||
// for DefRepLevelsToListInfo (if all that is needed is list offsets).
|
||||
uint8_t* valid_bits = NULLPTR;
|
||||
// Input only, offset into valid_bits to start at.
|
||||
int64_t valid_bits_offset = 0;
|
||||
};
|
||||
|
||||
// Converts def_levels to validity bitmaps for non-list arrays and structs that have
|
||||
// at least one member that is not a list and has no list descendants.
|
||||
// For lists use DefRepLevelsToList and structs where all descendants contain
|
||||
// a list use DefRepLevelsToBitmap.
|
||||
void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
|
||||
LevelInfo level_info,
|
||||
ValidityBitmapInputOutput* output);
|
||||
|
||||
// Reconstructs a validity bitmap and list offsets for a list array based on
|
||||
// def/rep levels. The first element of offsets will not be modified if rep_levels
|
||||
// starts with a new list. The first element of offsets will be used when calculating
|
||||
// the next offset. See documentation on DefLevelsToBitmap for when to use this
|
||||
// method vs the other ones in this file for reconstruction.
|
||||
//
|
||||
// Offsets must be sized to 1 + values_read_upper_bound.
|
||||
void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
|
||||
const int16_t* rep_levels, int64_t num_def_levels,
|
||||
LevelInfo level_info,
|
||||
ValidityBitmapInputOutput* output,
|
||||
int32_t* offsets);
|
||||
void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
|
||||
const int16_t* rep_levels, int64_t num_def_levels,
|
||||
LevelInfo level_info,
|
||||
ValidityBitmapInputOutput* output,
|
||||
int64_t* offsets);
|
||||
|
||||
// Reconstructs a validity bitmap for a struct where every member is a list or has
|
||||
// a list descendant. See documentation on DefLevelsToBitmap for more
|
||||
// details on this method compared to the other ones defined above.
|
||||
void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels,
|
||||
const int16_t* rep_levels,
|
||||
int64_t num_def_levels, LevelInfo level_info,
|
||||
ValidityBitmapInputOutput* output);
|
||||
|
||||
// This is exposed to ensure we can properly test a software simulated pext function
|
||||
// (i.e. it isn't hidden by runtime dispatch).
|
||||
uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection);
|
||||
|
||||
} // namespace parquet::internal
|
||||
@@ -0,0 +1,354 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
#pragma once
|
||||
|
||||
#include "parquet/level_conversion.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
|
||||
#include "arrow/util/bit_run_reader.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/bitmap_writer.h"
|
||||
#include "arrow/util/logging.h"
|
||||
#include "arrow/util/simd.h"
|
||||
#include "parquet/exception.h"
|
||||
#include "parquet/level_comparison.h"
|
||||
|
||||
#ifndef PARQUET_IMPL_NAMESPACE
|
||||
# error "PARQUET_IMPL_NAMESPACE must be defined"
|
||||
#endif
|
||||
|
||||
namespace parquet::internal::PARQUET_IMPL_NAMESPACE {
|
||||
|
||||
// clang-format off
|
||||
/* Python code to generate lookup table:
|
||||
|
||||
kLookupBits = 5
|
||||
count = 0
|
||||
print('constexpr int kLookupBits = {};'.format(kLookupBits))
|
||||
print('constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {')
|
||||
print(' ', end = '')
|
||||
for mask in range(1 << kLookupBits):
|
||||
for data in range(1 << kLookupBits):
|
||||
bit_value = 0
|
||||
bit_len = 0
|
||||
for i in range(kLookupBits):
|
||||
if mask & (1 << i):
|
||||
bit_value |= (((data >> i) & 1) << bit_len)
|
||||
bit_len += 1
|
||||
out = '0x{:02X},'.format(bit_value)
|
||||
count += 1
|
||||
if count % (1 << kLookupBits) == 1:
|
||||
print(' {')
|
||||
if count % 8 == 1:
|
||||
print(' ', end = '')
|
||||
if count % 8 == 0:
|
||||
print(out, end = '\n')
|
||||
else:
|
||||
print(out, end = ' ')
|
||||
if count % (1 << kLookupBits) == 0:
|
||||
print(' },', end = '')
|
||||
print('\n};')
|
||||
|
||||
*/
|
||||
// clang-format on
|
||||
|
||||
// Width in bits of the mask and data chunks used to index kPextTable, which has
// (1 << kLookupBits) rows and columns. ExtractBitsSoftware consumes this many
// bits of each input per loop iteration.
constexpr int kLookupBits = 5;
|
||||
constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {
|
||||
{
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
|
||||
0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
|
||||
0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
|
||||
0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
|
||||
0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
|
||||
0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
|
||||
0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
|
||||
0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
|
||||
0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
|
||||
0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
|
||||
0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
|
||||
0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
|
||||
0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
|
||||
0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
|
||||
0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
|
||||
0x03, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
|
||||
0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
|
||||
0x03, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
|
||||
0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
|
||||
0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
|
||||
0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
|
||||
0x02, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
|
||||
0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
|
||||
0x05, 0x06, 0x07, 0x06, 0x07, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
|
||||
0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
|
||||
0x05, 0x06, 0x06, 0x07, 0x07, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
|
||||
0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
|
||||
0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
|
||||
0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
|
||||
0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
|
||||
0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
|
||||
0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02,
|
||||
0x03, 0x03, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
|
||||
0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05,
|
||||
0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
|
||||
0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03,
|
||||
0x03, 0x03, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
|
||||
0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07,
|
||||
0x06, 0x07, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
|
||||
0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06,
|
||||
0x07, 0x07, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
|
||||
0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
|
||||
0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
|
||||
0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
|
||||
0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
|
||||
0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05,
|
||||
0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
|
||||
0x03, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x04, 0x04,
|
||||
0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
|
||||
0x07, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09,
|
||||
0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
|
||||
0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05,
|
||||
0x05, 0x05, 0x06, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x07,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
|
||||
0x05, 0x06, 0x07, 0x06, 0x07, 0x08, 0x09, 0x08, 0x09, 0x0A, 0x0B,
|
||||
0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x0D, 0x0E, 0x0F, 0x0E, 0x0F,
|
||||
},
|
||||
{
|
||||
0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
|
||||
0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0A, 0x0A,
|
||||
0x0B, 0x0B, 0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F,
|
||||
},
|
||||
{
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
|
||||
0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
|
||||
0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
|
||||
},
|
||||
};
|
||||
|
||||
inline uint64_t ExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
|
||||
// A software emulation of _pext_u64
|
||||
|
||||
// These checks should be inline and are likely to be common cases.
|
||||
if (select_bitmap == ~uint64_t{0}) {
|
||||
return bitmap;
|
||||
} else if (select_bitmap == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Fallback to lookup table method
|
||||
uint64_t bit_value = 0;
|
||||
int bit_len = 0;
|
||||
constexpr uint8_t kLookupMask = (1U << kLookupBits) - 1;
|
||||
while (select_bitmap != 0) {
|
||||
const auto mask_len = ARROW_POPCOUNT32(select_bitmap & kLookupMask);
|
||||
const uint64_t value = kPextTable[select_bitmap & kLookupMask][bitmap & kLookupMask];
|
||||
bit_value |= (value << bit_len);
|
||||
bit_len += mask_len;
|
||||
bitmap >>= kLookupBits;
|
||||
select_bitmap >>= kLookupBits;
|
||||
}
|
||||
return bit_value;
|
||||
}
|
||||
|
||||
#ifdef ARROW_HAVE_BMI2
|
||||
|
||||
// Use _pext_u64 on 64-bit builds, _pext_u32 on 32-bit builds,
|
||||
# if UINTPTR_MAX == 0xFFFFFFFF
|
||||
|
||||
using extract_bitmap_t = uint32_t;
|
||||
inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
|
||||
extract_bitmap_t select_bitmap) {
|
||||
return _pext_u32(bitmap, select_bitmap);
|
||||
}
|
||||
|
||||
# else
|
||||
|
||||
using extract_bitmap_t = uint64_t;
|
||||
inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
|
||||
extract_bitmap_t select_bitmap) {
|
||||
return _pext_u64(bitmap, select_bitmap);
|
||||
}
|
||||
|
||||
# endif
|
||||
|
||||
#else // !defined(ARROW_HAVE_BMI2)
|
||||
|
||||
// Use 64-bit pext emulation when BMI2 isn't available.
|
||||
using extract_bitmap_t = uint64_t;
|
||||
inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
|
||||
extract_bitmap_t select_bitmap) {
|
||||
return ExtractBitsSoftware(bitmap, select_bitmap);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Number of bits in extract_bitmap_t. DefLevelsToBitmapSimd uses this as its
// batch size and DefLevelsBatchToBitmap checks batches against it.
static constexpr int64_t kExtractBitsSize = 8 * sizeof(extract_bitmap_t);
|
||||
|
||||
template <bool has_repeated_parent>
|
||||
int64_t DefLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_size,
|
||||
int64_t upper_bound_remaining, LevelInfo level_info,
|
||||
::arrow::internal::FirstTimeBitmapWriter* writer) {
|
||||
ARROW_DCHECK_LE(batch_size, kExtractBitsSize);
|
||||
|
||||
// Greater than level_info.def_level - 1 implies >= the def_level
|
||||
auto defined_bitmap = static_cast<extract_bitmap_t>(
|
||||
internal::GreaterThanBitmap(def_levels, batch_size, level_info.def_level - 1));
|
||||
|
||||
if (has_repeated_parent) {
|
||||
// Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
|
||||
// repeated_ancestor_def_level
|
||||
auto present_bitmap = static_cast<extract_bitmap_t>(internal::GreaterThanBitmap(
|
||||
def_levels, batch_size, level_info.repeated_ancestor_def_level - 1));
|
||||
auto selected_bits = ExtractBits(defined_bitmap, present_bitmap);
|
||||
int64_t selected_count = ::arrow::bit_util::PopCount(present_bitmap);
|
||||
if (ARROW_PREDICT_FALSE(selected_count > upper_bound_remaining)) {
|
||||
throw ParquetException("Values read exceeded upper bound");
|
||||
}
|
||||
writer->AppendWord(selected_bits, selected_count);
|
||||
return ::arrow::bit_util::PopCount(selected_bits);
|
||||
} else {
|
||||
if (ARROW_PREDICT_FALSE(batch_size > upper_bound_remaining)) {
|
||||
std::stringstream ss;
|
||||
ss << "Values read exceeded upper bound";
|
||||
throw ParquetException(ss.str());
|
||||
}
|
||||
|
||||
writer->AppendWord(defined_bitmap, batch_size);
|
||||
return ::arrow::bit_util::PopCount(defined_bitmap);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool has_repeated_parent>
|
||||
void DefLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels,
|
||||
LevelInfo level_info, ValidityBitmapInputOutput* output) {
|
||||
::arrow::internal::FirstTimeBitmapWriter writer(
|
||||
output->valid_bits,
|
||||
/*start_offset=*/output->valid_bits_offset,
|
||||
/*length=*/output->values_read_upper_bound);
|
||||
int64_t set_count = 0;
|
||||
output->values_read = 0;
|
||||
int64_t values_read_remaining = output->values_read_upper_bound;
|
||||
while (num_def_levels > kExtractBitsSize) {
|
||||
set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
|
||||
def_levels, kExtractBitsSize, values_read_remaining, level_info, &writer);
|
||||
def_levels += kExtractBitsSize;
|
||||
num_def_levels -= kExtractBitsSize;
|
||||
values_read_remaining = output->values_read_upper_bound - writer.position();
|
||||
}
|
||||
set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
|
||||
def_levels, num_def_levels, values_read_remaining, level_info, &writer);
|
||||
|
||||
output->values_read = writer.position();
|
||||
output->null_count += output->values_read - set_count;
|
||||
writer.Finish();
|
||||
}
|
||||
|
||||
} // namespace parquet::internal::PARQUET_IMPL_NAMESPACE
|
||||
@@ -0,0 +1,554 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "parquet/encryption/type_fwd.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/properties.h"
|
||||
#include "parquet/type_fwd.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
using KeyValueMetadata = ::arrow::KeyValueMetadata;
|
||||
|
||||
class PARQUET_EXPORT ApplicationVersion {
|
||||
public:
|
||||
// Known Versions with Issues
|
||||
static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
|
||||
static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
|
||||
static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
|
||||
static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
|
||||
static const ApplicationVersion& PARQUET_CPP_10353_FIXED_VERSION();
|
||||
|
||||
// Application that wrote the file. e.g. "IMPALA"
|
||||
std::string application_;
|
||||
// Build name
|
||||
std::string build_;
|
||||
|
||||
// Version of the application that wrote the file, expressed as
|
||||
// (<major>.<minor>.<patch>). Unmatched parts default to 0.
|
||||
// "1.2.3" => {1, 2, 3}
|
||||
// "1.2" => {1, 2, 0}
|
||||
// "1.2-cdh5" => {1, 2, 0}
|
||||
struct {
|
||||
int major;
|
||||
int minor;
|
||||
int patch;
|
||||
std::string unknown;
|
||||
std::string pre_release;
|
||||
std::string build_info;
|
||||
} version;
|
||||
|
||||
ApplicationVersion() = default;
|
||||
explicit ApplicationVersion(const std::string& created_by);
|
||||
ApplicationVersion(std::string application, int major, int minor, int patch);
|
||||
|
||||
// Returns true if version is strictly less than other_version
|
||||
bool VersionLt(const ApplicationVersion& other_version) const;
|
||||
|
||||
// Returns true if version is strictly equal with other_version
|
||||
bool VersionEq(const ApplicationVersion& other_version) const;
|
||||
|
||||
// Checks if the Version has the correct statistics for a given column
|
||||
bool HasCorrectStatistics(Type::type primitive, const EncodedStatistics& statistics,
|
||||
SortOrder::type sort_order = SortOrder::SIGNED) const;
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT ColumnCryptoMetaData {
|
||||
public:
|
||||
static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
|
||||
~ColumnCryptoMetaData();
|
||||
|
||||
bool Equals(const ColumnCryptoMetaData& other) const;
|
||||
|
||||
std::shared_ptr<schema::ColumnPath> path_in_schema() const;
|
||||
bool encrypted_with_footer_key() const;
|
||||
const std::string& key_metadata() const;
|
||||
|
||||
private:
|
||||
explicit ColumnCryptoMetaData(const uint8_t* metadata);
|
||||
|
||||
class ColumnCryptoMetaDataImpl;
|
||||
std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
|
||||
};
|
||||
|
||||
/// \brief Public struct for Thrift PageEncodingStats in ColumnChunkMetaData
|
||||
struct PageEncodingStats {
|
||||
PageType::type page_type;
|
||||
Encoding::type encoding;
|
||||
int32_t count;
|
||||
};
|
||||
|
||||
/// \brief Public struct for location to page index in ColumnChunkMetaData.
///
/// Plain aggregate describing a byte range: where the index starts in the file
/// and how many bytes it occupies.
struct IndexLocation {
  /// File offset of the given index, in bytes
  int64_t offset;
  /// Length of the given index, in bytes
  int32_t length;
};
|
||||
|
||||
/// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
|
||||
class PARQUET_EXPORT ColumnChunkMetaData {
|
||||
public:
|
||||
// API convenience to get a MetaData accessor
|
||||
static std::unique_ptr<ColumnChunkMetaData> Make(
|
||||
const void* metadata, const ColumnDescriptor* descr,
|
||||
const ReaderProperties& properties = default_reader_properties(),
|
||||
const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1,
|
||||
int16_t column_ordinal = -1,
|
||||
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
|
||||
|
||||
~ColumnChunkMetaData();
|
||||
|
||||
bool Equals(const ColumnChunkMetaData& other) const;
|
||||
|
||||
// Byte offset of `ColumnMetaData` in `file_path()`.
|
||||
//
|
||||
// Note that the meaning of this field has been inconsistent among implementations
|
||||
// so its use has since been deprecated in the Parquet specification. Modern
|
||||
// implementations will set this to `0` to indicate that the `ColumnMetaData` is solely
|
||||
// contained in the `ColumnChunk` struct.
|
||||
int64_t file_offset() const;
|
||||
|
||||
// parameter is only used when a dataset is spread across multiple files
|
||||
const std::string& file_path() const;
|
||||
|
||||
// column metadata
|
||||
bool is_metadata_set() const;
|
||||
Type::type type() const;
|
||||
int64_t num_values() const;
|
||||
std::shared_ptr<schema::ColumnPath> path_in_schema() const;
|
||||
bool is_stats_set() const;
|
||||
bool is_geo_stats_set() const;
|
||||
std::shared_ptr<Statistics> statistics() const;
|
||||
std::shared_ptr<EncodedStatistics> encoded_statistics() const;
|
||||
std::shared_ptr<SizeStatistics> size_statistics() const;
|
||||
std::shared_ptr<geospatial::GeoStatistics> geo_statistics() const;
|
||||
|
||||
Compression::type compression() const;
|
||||
// Indicate if the ColumnChunk compression is supported by the current
|
||||
// compiled parquet library.
|
||||
bool can_decompress() const;
|
||||
|
||||
const std::vector<Encoding::type>& encodings() const;
|
||||
const std::vector<PageEncodingStats>& encoding_stats() const;
|
||||
std::optional<int64_t> bloom_filter_offset() const;
|
||||
std::optional<int64_t> bloom_filter_length() const;
|
||||
bool has_dictionary_page() const;
|
||||
int64_t dictionary_page_offset() const;
|
||||
int64_t data_page_offset() const;
|
||||
bool has_index_page() const;
|
||||
int64_t index_page_offset() const;
|
||||
int64_t total_compressed_size() const;
|
||||
int64_t total_uncompressed_size() const;
|
||||
std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
|
||||
std::optional<IndexLocation> GetColumnIndexLocation() const;
|
||||
std::optional<IndexLocation> GetOffsetIndexLocation() const;
|
||||
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
|
||||
|
||||
private:
|
||||
explicit ColumnChunkMetaData(
|
||||
const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
|
||||
int16_t column_ordinal, const ReaderProperties& properties,
|
||||
const ApplicationVersion* writer_version = NULLPTR,
|
||||
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
|
||||
// PIMPL Idiom
|
||||
class ColumnChunkMetaDataImpl;
|
||||
std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
|
||||
};
|
||||
|
||||
/// \brief RowGroupMetaData is a proxy around format::RowGroupMetaData.
|
||||
class PARQUET_EXPORT RowGroupMetaData {
|
||||
public:
|
||||
/// \brief Create a RowGroupMetaData from a serialized thrift message.
|
||||
static std::unique_ptr<RowGroupMetaData> Make(
|
||||
const void* metadata, const SchemaDescriptor* schema,
|
||||
const ReaderProperties& properties = default_reader_properties(),
|
||||
const ApplicationVersion* writer_version = NULLPTR,
|
||||
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
|
||||
|
||||
~RowGroupMetaData();
|
||||
|
||||
bool Equals(const RowGroupMetaData& other) const;
|
||||
|
||||
/// \brief The number of columns in this row group. The order must match the
|
||||
/// parent's column ordering.
|
||||
int num_columns() const;
|
||||
|
||||
/// \brief Return the ColumnChunkMetaData of the corresponding column ordinal.
|
||||
///
|
||||
/// WARNING, the returned object references memory location in it's parent
|
||||
/// (RowGroupMetaData) object. Hence, the parent must outlive the returned
|
||||
/// object.
|
||||
///
|
||||
/// \param[in] index of the ColumnChunkMetaData to retrieve.
|
||||
///
|
||||
/// \throws ParquetException if the index is out of bound.
|
||||
std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const;
|
||||
|
||||
/// \brief Number of rows in this row group.
|
||||
int64_t num_rows() const;
|
||||
|
||||
/// \brief Total byte size of all the uncompressed column data in this row group.
|
||||
int64_t total_byte_size() const;
|
||||
|
||||
/// \brief Total byte size of all the compressed (and potentially encrypted)
|
||||
/// column data in this row group.
|
||||
///
|
||||
/// This information is optional and may be 0 if omitted.
|
||||
int64_t total_compressed_size() const;
|
||||
|
||||
/// \brief Byte offset from beginning of file to first page (data or
|
||||
/// dictionary) in this row group
|
||||
///
|
||||
/// The file_offset field that this method exposes is optional. This method
|
||||
/// will return 0 if that field is not set to a meaningful value.
|
||||
int64_t file_offset() const;
|
||||
// Return const-pointer to make it clear that this object is not to be copied
|
||||
const SchemaDescriptor* schema() const;
|
||||
// Indicate if all of the RowGroup's ColumnChunks can be decompressed.
|
||||
bool can_decompress() const;
|
||||
// Sorting columns of the row group if any.
|
||||
std::vector<SortingColumn> sorting_columns() const;
|
||||
|
||||
private:
|
||||
explicit RowGroupMetaData(
|
||||
const void* metadata, const SchemaDescriptor* schema,
|
||||
const ReaderProperties& properties,
|
||||
const ApplicationVersion* writer_version = NULLPTR,
|
||||
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
|
||||
// PIMPL Idiom
|
||||
class RowGroupMetaDataImpl;
|
||||
std::unique_ptr<RowGroupMetaDataImpl> impl_;
|
||||
};
|
||||
|
||||
class FileMetaDataBuilder;
|
||||
|
||||
/// \brief FileMetaData is a proxy around format::FileMetaData.
|
||||
class PARQUET_EXPORT FileMetaData {
|
||||
public:
|
||||
/// \brief Create a FileMetaData from a serialized thrift message.
|
||||
static std::shared_ptr<FileMetaData> Make(
|
||||
const void* serialized_metadata, uint32_t* inout_metadata_len,
|
||||
const ReaderProperties& properties = default_reader_properties(),
|
||||
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
|
||||
|
||||
~FileMetaData();
|
||||
|
||||
bool Equals(const FileMetaData& other) const;
|
||||
|
||||
/// \brief The number of parquet "leaf" columns.
|
||||
///
|
||||
/// Parquet thrift definition requires that nested schema elements are
|
||||
/// flattened. This method returns the number of columns in the flattened
|
||||
/// version.
|
||||
/// For instance, if the schema looks like this :
|
||||
/// 0 foo.bar
|
||||
/// foo.bar.baz 0
|
||||
/// foo.bar.baz2 1
|
||||
/// foo.qux 2
|
||||
/// 1 foo2 3
|
||||
/// 2 foo3 4
|
||||
/// This method will return 5, because there are 5 "leaf" fields (so 5
|
||||
/// flattened fields)
|
||||
int num_columns() const;
|
||||
|
||||
/// \brief The number of flattened schema elements.
|
||||
///
|
||||
/// Parquet thrift definition requires that nested schema elements are
|
||||
/// flattened. This method returns the total number of elements in the
|
||||
/// flattened list.
|
||||
int num_schema_elements() const;
|
||||
|
||||
/// \brief The total number of rows.
|
||||
///
|
||||
/// If the FileMetaData was obtained by calling `SubSet()`, this is the total
|
||||
/// number of rows in the selected row groups.
|
||||
int64_t num_rows() const;
|
||||
|
||||
/// \brief The number of row groups in the file.
|
||||
///
|
||||
/// If the FileMetaData was obtained by calling `SubSet()`, this is the number
|
||||
/// of selected row groups.
|
||||
int num_row_groups() const;
|
||||
|
||||
/// \brief Return the RowGroupMetaData of the corresponding row group ordinal.
|
||||
///
|
||||
/// WARNING, the returned object references memory location in it's parent
|
||||
/// (FileMetaData) object. Hence, the parent must outlive the returned object.
|
||||
///
|
||||
/// \param[in] index of the RowGroup to retrieve.
|
||||
///
|
||||
/// \throws ParquetException if the index is out of bound.
|
||||
std::unique_ptr<RowGroupMetaData> RowGroup(int index) const;
|
||||
|
||||
/// \brief Return the "version" of the file
|
||||
///
|
||||
/// WARNING: The value returned by this method is unreliable as 1) the Parquet
|
||||
/// file metadata stores the version as a single integer and 2) some producers
|
||||
/// are known to always write a hardcoded value. Therefore, you cannot use
|
||||
/// this value to know which features are used in the file.
|
||||
ParquetVersion::type version() const;
|
||||
|
||||
/// \brief Return the application's user-agent string of the writer.
|
||||
const std::string& created_by() const;
|
||||
|
||||
/// \brief Return the application's version of the writer.
|
||||
const ApplicationVersion& writer_version() const;
|
||||
|
||||
/// \brief Size of the original thrift encoded metadata footer.
|
||||
uint32_t size() const;
|
||||
|
||||
/// \brief Indicate if all of the FileMetaData's RowGroups can be decompressed.
|
||||
///
|
||||
/// This will return false if any of the RowGroup's page is compressed with a
|
||||
/// compression format which is not compiled in the current parquet library.
|
||||
bool can_decompress() const;
|
||||
|
||||
bool is_encryption_algorithm_set() const;
|
||||
EncryptionAlgorithm encryption_algorithm() const;
|
||||
const std::string& footer_signing_key_metadata() const;
|
||||
|
||||
/// \brief Verify signature of FileMetaData when file is encrypted but footer
|
||||
/// is not encrypted (plaintext footer).
|
||||
bool VerifySignature(const void* signature);
|
||||
|
||||
void WriteTo(::arrow::io::OutputStream* dst,
|
||||
const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const;
|
||||
|
||||
/// \brief Return Thrift-serialized representation of the metadata as a
|
||||
/// string
|
||||
std::string SerializeToString() const;
|
||||
|
||||
// Return const-pointer to make it clear that this object is not to be copied
|
||||
const SchemaDescriptor* schema() const;
|
||||
|
||||
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
|
||||
|
||||
/// \brief Set a path to all ColumnChunk for all RowGroups.
|
||||
///
|
||||
/// Commonly used by systems (Dask, Spark) who generates an metadata-only
|
||||
/// parquet file. The path is usually relative to said index file.
|
||||
///
|
||||
/// \param[in] path to set.
|
||||
void set_file_path(const std::string& path);
|
||||
|
||||
/// \brief Merge row groups from another metadata file into this one.
|
||||
///
|
||||
/// The schema of the input FileMetaData must be equal to the
|
||||
/// schema of this object.
|
||||
///
|
||||
/// This is used by systems who creates an aggregate metadata-only file by
|
||||
/// concatenating the row groups of multiple files. This newly created
|
||||
/// metadata file acts as an index of all available row groups.
|
||||
///
|
||||
/// \param[in] other FileMetaData to merge the row groups from.
|
||||
///
|
||||
/// \throws ParquetException if schemas are not equal.
|
||||
void AppendRowGroups(const FileMetaData& other);
|
||||
|
||||
/// \brief Return a FileMetaData containing a subset of the row groups in this
|
||||
/// FileMetaData.
|
||||
std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;
|
||||
|
||||
/// \brief Serialize metadata unencrypted as string
|
||||
///
|
||||
/// \param[in] scrub whether to remove sensitive information from the metadata.
|
||||
/// \param[in] debug whether to serialize the metadata as Thrift (if false) or
|
||||
/// debug text (if true).
|
||||
std::string SerializeUnencrypted(bool scrub, bool debug) const;
|
||||
|
||||
private:
|
||||
friend FileMetaDataBuilder;
|
||||
friend class SerializedFile;
|
||||
friend class SerializedRowGroup;
|
||||
|
||||
explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
|
||||
const ReaderProperties& properties,
|
||||
std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
|
||||
|
||||
void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
|
||||
const std::shared_ptr<InternalFileDecryptor>& file_decryptor() const;
|
||||
|
||||
// PIMPL Idiom
|
||||
FileMetaData();
|
||||
class FileMetaDataImpl;
|
||||
std::unique_ptr<FileMetaDataImpl> impl_;
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT FileCryptoMetaData {
|
||||
public:
|
||||
// API convenience to get a MetaData accessor
|
||||
static std::shared_ptr<FileCryptoMetaData> Make(
|
||||
const uint8_t* serialized_metadata, uint32_t* metadata_len,
|
||||
const ReaderProperties& properties = default_reader_properties());
|
||||
~FileCryptoMetaData();
|
||||
|
||||
EncryptionAlgorithm encryption_algorithm() const;
|
||||
const std::string& key_metadata() const;
|
||||
|
||||
void WriteTo(::arrow::io::OutputStream* dst) const;
|
||||
|
||||
private:
|
||||
friend FileMetaDataBuilder;
|
||||
FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len,
|
||||
const ReaderProperties& properties);
|
||||
|
||||
// PIMPL Idiom
|
||||
FileCryptoMetaData();
|
||||
class FileCryptoMetaDataImpl;
|
||||
std::unique_ptr<FileCryptoMetaDataImpl> impl_;
|
||||
};
|
||||
|
||||
// Builder API
|
||||
class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
|
||||
public:
|
||||
// API convenience to get a MetaData reader
|
||||
static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
|
||||
std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column);
|
||||
|
||||
static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
|
||||
std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
|
||||
void* contents);
|
||||
|
||||
~ColumnChunkMetaDataBuilder();
|
||||
|
||||
// column chunk
|
||||
// Used when a dataset is spread across multiple files
|
||||
void set_file_path(const std::string& path);
|
||||
|
||||
// column metadata
|
||||
void SetStatistics(const EncodedStatistics& stats);
|
||||
void SetSizeStatistics(const SizeStatistics& size_stats);
|
||||
|
||||
// column geometry statistics
|
||||
void SetGeoStatistics(const geospatial::EncodedGeoStatistics& geo_stats);
|
||||
|
||||
void SetKeyValueMetadata(std::shared_ptr<const KeyValueMetadata> key_value_metadata);
|
||||
|
||||
// get the column descriptor
|
||||
const ColumnDescriptor* descr() const;
|
||||
|
||||
int64_t total_compressed_size() const;
|
||||
// commit the metadata
|
||||
|
||||
void Finish(int64_t num_values, int64_t dictionary_page_offset,
|
||||
int64_t index_page_offset, int64_t data_page_offset,
|
||||
int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
|
||||
bool dictionary_fallback,
|
||||
const std::map<Encoding::type, int32_t>& dict_encoding_stats_,
|
||||
const std::map<Encoding::type, int32_t>& data_encoding_stats_,
|
||||
const std::shared_ptr<Encryptor>& encryptor = NULLPTR);
|
||||
|
||||
// The metadata contents, suitable for passing to ColumnChunkMetaData::Make
|
||||
const void* contents() const;
|
||||
|
||||
// For writing metadata at end of column chunk
|
||||
void WriteTo(::arrow::io::OutputStream* sink);
|
||||
|
||||
private:
|
||||
explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
|
||||
const ColumnDescriptor* column);
|
||||
explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
|
||||
const ColumnDescriptor* column, void* contents);
|
||||
// PIMPL Idiom
|
||||
class ColumnChunkMetaDataBuilderImpl;
|
||||
std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT RowGroupMetaDataBuilder {
|
||||
public:
|
||||
// API convenience to get a MetaData reader
|
||||
static std::unique_ptr<RowGroupMetaDataBuilder> Make(
|
||||
std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
|
||||
void* contents);
|
||||
|
||||
~RowGroupMetaDataBuilder();
|
||||
|
||||
ColumnChunkMetaDataBuilder* NextColumnChunk();
|
||||
int num_columns();
|
||||
int64_t num_rows();
|
||||
int current_column() const;
|
||||
|
||||
void set_num_rows(int64_t num_rows);
|
||||
|
||||
// commit the metadata
|
||||
void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1);
|
||||
|
||||
private:
|
||||
explicit RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
|
||||
const SchemaDescriptor* schema_, void* contents);
|
||||
// PIMPL Idiom
|
||||
class RowGroupMetaDataBuilderImpl;
|
||||
std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
|
||||
};
|
||||
|
||||
/// \brief Public struct for location to all page indexes in a parquet file.
|
||||
struct PageIndexLocation {
|
||||
/// Alias type of page index location of a row group. The index location
|
||||
/// is located by column ordinal. If the column does not have the page index,
|
||||
/// its value is set to std::nullopt.
|
||||
using RowGroupIndexLocation = std::vector<std::optional<IndexLocation>>;
|
||||
/// Alias type of page index location of a parquet file. The index location
|
||||
/// is located by the row group ordinal.
|
||||
using FileIndexLocation = std::map<size_t, RowGroupIndexLocation>;
|
||||
/// Row group column index locations which uses row group ordinal as the key.
|
||||
FileIndexLocation column_index_location;
|
||||
/// Row group offset index locations which uses row group ordinal as the key.
|
||||
FileIndexLocation offset_index_location;
|
||||
};
|
||||
|
||||
class PARQUET_EXPORT FileMetaDataBuilder {
|
||||
public:
|
||||
// API convenience to get a MetaData builder
|
||||
static std::unique_ptr<FileMetaDataBuilder> Make(
|
||||
const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props);
|
||||
|
||||
~FileMetaDataBuilder();
|
||||
|
||||
// The prior RowGroupMetaDataBuilder (if any) is destroyed
|
||||
RowGroupMetaDataBuilder* AppendRowGroup();
|
||||
|
||||
// Update location to all page indexes in the parquet file
|
||||
void SetPageIndexLocation(const PageIndexLocation& location);
|
||||
|
||||
// Complete the Thrift structure
|
||||
std::unique_ptr<FileMetaData> Finish(
|
||||
const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR);
|
||||
|
||||
// crypto metadata
|
||||
std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData();
|
||||
|
||||
private:
|
||||
explicit FileMetaDataBuilder(const SchemaDescriptor* schema,
|
||||
std::shared_ptr<WriterProperties> props);
|
||||
// PIMPL Idiom
|
||||
class FileMetaDataBuilderImpl;
|
||||
std::unique_ptr<FileMetaDataBuilderImpl> impl_;
|
||||
};
|
||||
|
||||
/// \brief Convert a ParquetVersion::type value to a std::string.
///
/// The exact text produced for each version is defined by the implementation,
/// which is not visible in this header.
PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,386 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/io/interfaces.h"
|
||||
#include "parquet/encryption/type_fwd.h"
|
||||
#include "parquet/type_fwd.h"
|
||||
#include "parquet/types.h"
|
||||
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
namespace parquet {
|
||||
|
||||
/// \brief ColumnIndex is a proxy around format::ColumnIndex.
|
||||
class PARQUET_EXPORT ColumnIndex {
|
||||
public:
|
||||
/// \brief Create a ColumnIndex from a serialized thrift message.
|
||||
static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr,
|
||||
const void* serialized_index,
|
||||
uint32_t index_len,
|
||||
const ReaderProperties& properties,
|
||||
Decryptor* decryptor = NULLPTR);
|
||||
|
||||
virtual ~ColumnIndex() = default;
|
||||
|
||||
/// \brief A bitmap with a bit set for each data page that has only null values.
|
||||
///
|
||||
/// The length of this vector is equal to the number of data pages in the column.
|
||||
virtual const std::vector<bool>& null_pages() const = 0;
|
||||
|
||||
/// \brief A vector of encoded lower bounds for each data page in this column.
|
||||
///
|
||||
/// `null_pages` should be inspected first, as only pages with non-null values
|
||||
/// may have their lower bounds populated.
|
||||
virtual const std::vector<std::string>& encoded_min_values() const = 0;
|
||||
|
||||
/// \brief A vector of encoded upper bounds for each data page in this column.
|
||||
///
|
||||
/// `null_pages` should be inspected first, as only pages with non-null values
|
||||
/// may have their upper bounds populated.
|
||||
virtual const std::vector<std::string>& encoded_max_values() const = 0;
|
||||
|
||||
/// \brief The ordering of lower and upper bounds.
|
||||
///
|
||||
/// The boundary order applies across all lower bounds, and all upper bounds,
|
||||
/// respectively. However, the order between lower bounds and upper bounds
|
||||
/// cannot be derived from this.
|
||||
virtual BoundaryOrder::type boundary_order() const = 0;
|
||||
|
||||
/// \brief Whether per-page null count information is available.
|
||||
virtual bool has_null_counts() const = 0;
|
||||
|
||||
/// \brief An optional vector with the number of null values in each data page.
|
||||
///
|
||||
/// `has_null_counts` should be called first to determine if this information is
|
||||
/// available.
|
||||
virtual const std::vector<int64_t>& null_counts() const = 0;
|
||||
|
||||
/// \brief A vector of page indices for non-null pages.
|
||||
virtual const std::vector<int32_t>& non_null_page_indices() const = 0;
|
||||
|
||||
/// \brief Whether definition level histogram is available.
|
||||
virtual bool has_definition_level_histograms() const = 0;
|
||||
|
||||
/// \brief Whether repetition level histogram is available.
|
||||
virtual bool has_repetition_level_histograms() const = 0;
|
||||
|
||||
/// \brief List of definition level histograms for each page concatenated together.
|
||||
virtual const std::vector<int64_t>& definition_level_histograms() const = 0;
|
||||
|
||||
/// \brief List of repetition level histograms for each page concatenated together.
|
||||
virtual const std::vector<int64_t>& repetition_level_histograms() const = 0;
|
||||
};
|
||||
|
||||
/// \brief Typed implementation of ColumnIndex.
|
||||
template <typename DType>
|
||||
class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex {
|
||||
public:
|
||||
using T = typename DType::c_type;
|
||||
|
||||
/// \brief A vector of lower bounds for each data page in this column.
|
||||
///
|
||||
/// This is like `encoded_min_values`, but with the values decoded according to
|
||||
/// the column's physical type.
|
||||
/// `min_values` and `max_values` can be used together with `boundary_order`
|
||||
/// in order to prune some data pages when searching for specific values.
|
||||
virtual const std::vector<T>& min_values() const = 0;
|
||||
|
||||
/// \brief A vector of upper bounds for each data page in this column.
|
||||
///
|
||||
/// Just like `min_values`, but for upper bounds instead of lower bounds.
|
||||
virtual const std::vector<T>& max_values() const = 0;
|
||||
};
|
||||
|
||||
// Convenience aliases of TypedColumnIndex, one per data type tag.
using BoolColumnIndex = TypedColumnIndex<BooleanType>;
using Int32ColumnIndex = TypedColumnIndex<Int32Type>;
using Int64ColumnIndex = TypedColumnIndex<Int64Type>;
using FloatColumnIndex = TypedColumnIndex<FloatType>;
using DoubleColumnIndex = TypedColumnIndex<DoubleType>;
using ByteArrayColumnIndex = TypedColumnIndex<ByteArrayType>;
using FLBAColumnIndex = TypedColumnIndex<FLBAType>;
|
||||
|
||||
/// \brief PageLocation is a proxy around format::PageLocation.
|
||||
struct PARQUET_EXPORT PageLocation {
|
||||
/// File offset of the data page.
|
||||
int64_t offset;
|
||||
/// Total compressed size of the data page and header.
|
||||
int32_t compressed_page_size;
|
||||
/// Row id of the first row in the page within the row group.
|
||||
int64_t first_row_index;
|
||||
};
|
||||
|
||||
/// \brief OffsetIndex is a proxy around format::OffsetIndex.
|
||||
class PARQUET_EXPORT OffsetIndex {
|
||||
public:
|
||||
/// \brief Create a OffsetIndex from a serialized thrift message.
|
||||
static std::unique_ptr<OffsetIndex> Make(const void* serialized_index,
|
||||
uint32_t index_len,
|
||||
const ReaderProperties& properties,
|
||||
Decryptor* decryptor = NULLPTR);
|
||||
|
||||
virtual ~OffsetIndex() = default;
|
||||
|
||||
/// \brief A vector of locations for each data page in this column.
|
||||
virtual const std::vector<PageLocation>& page_locations() const = 0;
|
||||
|
||||
/// \brief A vector of unencoded/uncompressed size of each page for BYTE_ARRAY types,
|
||||
/// or empty for other types.
|
||||
virtual const std::vector<int64_t>& unencoded_byte_array_data_bytes() const = 0;
|
||||
};
|
||||
|
||||
/// \brief Interface for reading the page index for a Parquet row group.
|
||||
class PARQUET_EXPORT RowGroupPageIndexReader {
|
||||
public:
|
||||
virtual ~RowGroupPageIndexReader() = default;
|
||||
|
||||
/// \brief Read column index of a column chunk.
|
||||
///
|
||||
/// \param[in] i column ordinal of the column chunk.
|
||||
/// \returns column index of the column or nullptr if it does not exist.
|
||||
/// \throws ParquetException if the index is out of bound.
|
||||
virtual std::shared_ptr<ColumnIndex> GetColumnIndex(int32_t i) = 0;
|
||||
|
||||
/// \brief Read offset index of a column chunk.
|
||||
///
|
||||
/// \param[in] i column ordinal of the column chunk.
|
||||
/// \returns offset index of the column or nullptr if it does not exist.
|
||||
/// \throws ParquetException if the index is out of bound.
|
||||
virtual std::shared_ptr<OffsetIndex> GetOffsetIndex(int32_t i) = 0;
|
||||
};
|
||||
|
||||
/// \brief Selects which kinds of page index to read.
///
/// Both flags default to false, i.e. nothing is selected until a flag is set.
struct PageIndexSelection {
  /// Specifies whether to read the column index.
  bool column_index = false;
  /// Specifies whether to read the offset index.
  bool offset_index = false;
};
|
||||
|
||||
/// \brief Stream insertion operator for PageIndexSelection.
///
/// The output format is defined by the implementation, which is not visible in
/// this header.
PARQUET_EXPORT
std::ostream& operator<<(std::ostream& out, const PageIndexSelection& params);
|
||||
|
||||
struct RowGroupIndexReadRange {
|
||||
/// Base start and total size of column index of all column chunks in a row group.
|
||||
/// If none of the column chunks have column index, it is set to std::nullopt.
|
||||
std::optional<::arrow::io::ReadRange> column_index = std::nullopt;
|
||||
/// Base start and total size of offset index of all column chunks in a row group.
|
||||
/// If none of the column chunks have offset index, it is set to std::nullopt.
|
||||
std::optional<::arrow::io::ReadRange> offset_index = std::nullopt;
|
||||
};
|
||||
|
||||
/// \brief Interface for reading the page index for a Parquet file.
|
||||
class PARQUET_EXPORT PageIndexReader {
|
||||
public:
|
||||
virtual ~PageIndexReader() = default;
|
||||
|
||||
/// \brief Create a PageIndexReader instance.
|
||||
/// \returns a PageIndexReader instance.
|
||||
/// WARNING: The returned PageIndexReader references to all the input parameters, so
|
||||
/// it must not outlive all of the input parameters. Usually these input parameters
|
||||
/// come from the same ParquetFileReader object, so it must not outlive the reader
|
||||
/// that creates this PageIndexReader.
|
||||
static std::shared_ptr<PageIndexReader> Make(
|
||||
::arrow::io::RandomAccessFile* input, std::shared_ptr<FileMetaData> file_metadata,
|
||||
const ReaderProperties& properties,
|
||||
InternalFileDecryptor* file_decryptor = NULLPTR);
|
||||
|
||||
/// \brief Get the page index reader of a specific row group.
|
||||
/// \param[in] i row group ordinal to get page index reader.
|
||||
/// \returns RowGroupPageIndexReader of the specified row group. A nullptr may or may
|
||||
/// not be returned if the page index for the row group is unavailable. It is
|
||||
/// the caller's responsibility to check the return value of follow-up calls
|
||||
/// to the RowGroupPageIndexReader.
|
||||
/// \throws ParquetException if the index is out of bound.
|
||||
virtual std::shared_ptr<RowGroupPageIndexReader> RowGroup(int i) = 0;
|
||||
|
||||
/// \brief Advise the reader which part of page index will be read later.
|
||||
///
|
||||
/// The PageIndexReader can optionally prefetch and cache page index that
|
||||
/// may be read later to get better performance.
|
||||
///
|
||||
/// The contract of this function is as below:
|
||||
/// 1) If WillNeed() has not been called for a specific row group and the page index
|
||||
/// exists, follow-up calls to get column index or offset index of all columns in
|
||||
/// this row group SHOULD NOT FAIL, but the performance may not be optimal.
|
||||
/// 2) If WillNeed() has been called for a specific row group, follow-up calls to get
|
||||
/// page index are limited to columns and index type requested by WillNeed().
|
||||
/// So it MAY FAIL if columns that are not requested by WillNeed() are requested.
|
||||
/// 3) Later calls to WillNeed() MAY OVERRIDE previous calls of same row groups.
|
||||
/// For example,
|
||||
/// 1) If WillNeed() is not called for row group 0, then follow-up calls to read
|
||||
/// column index and/or offset index of all columns of row group 0 should not
|
||||
/// fail if its page index exists.
|
||||
/// 2) If WillNeed() is called for columns 0 and 1 for row group 0, then follow-up
|
||||
/// call to read page index of column 2 for row group 0 MAY FAIL even if its
|
||||
/// page index exists.
|
||||
/// 3) If WillNeed() is called for row group 0 with offset index only, then
|
||||
/// follow-up call to read column index of row group 0 MAY FAIL even if
|
||||
/// the column index of this column exists.
|
||||
/// 4) If WillNeed() is called for columns 0 and 1 for row group 0, then later
|
||||
/// call to WillNeed() for columns 1 and 2 for row group 0. The later one
|
||||
/// overrides previous call and only columns 1 and 2 of row group 0 are allowed
|
||||
/// to access.
|
||||
///
|
||||
/// \param[in] row_group_indices list of row group ordinal to read page index later.
|
||||
/// \param[in] column_indices list of column ordinal to read page index later. If it is
|
||||
/// empty, it means all columns in the row group will be read.
|
||||
/// \param[in] selection which kind of page index is required later.
|
||||
virtual void WillNeed(const std::vector<int32_t>& row_group_indices,
|
||||
const std::vector<int32_t>& column_indices,
|
||||
const PageIndexSelection& selection) = 0;
|
||||
|
||||
/// \brief Advise the reader page index of these row groups will not be read anymore.
|
||||
///
|
||||
/// The PageIndexReader implementation has the opportunity to cancel any prefetch or
|
||||
/// release resource that are related to these row groups.
|
||||
///
|
||||
/// \param[in] row_group_indices list of row group ordinal that whose page index will
|
||||
/// not be accessed anymore.
|
||||
virtual void WillNotNeed(const std::vector<int32_t>& row_group_indices) = 0;
|
||||
|
||||
/// \brief Determine the column index and offset index ranges for the given row group.
|
||||
///
|
||||
/// \param[in] row_group_metadata row group metadata to get column chunk metadata.
|
||||
/// \param[in] columns list of column ordinals to get page index. If the list is empty,
|
||||
/// it means all columns in the row group.
|
||||
/// \returns RowGroupIndexReadRange of the specified row group. Throws ParquetException
|
||||
/// if the selected column ordinal is out of bound or metadata of page index
|
||||
/// is corrupted.
|
||||
static RowGroupIndexReadRange DeterminePageIndexRangesInRowGroup(
|
||||
const RowGroupMetaData& row_group_metadata, const std::vector<int32_t>& columns);
|
||||
};
|
||||
|
||||
/// \brief Interface for collecting column index of data pages in a column chunk.
|
||||
class PARQUET_EXPORT ColumnIndexBuilder {
|
||||
public:
|
||||
/// \brief API convenience to create a ColumnIndexBuilder.
|
||||
static std::unique_ptr<ColumnIndexBuilder> Make(const ColumnDescriptor* descr);
|
||||
|
||||
virtual ~ColumnIndexBuilder() = default;
|
||||
|
||||
/// \brief Add statistics of a data page.
|
||||
///
|
||||
/// If the ColumnIndexBuilder has seen any corrupted statistics, it will
|
||||
/// not update statistics anymore.
|
||||
///
|
||||
/// \param stats Page statistics in the encoded form.
|
||||
/// \param size_stats Size statistics of the page if available.
|
||||
virtual void AddPage(const EncodedStatistics& stats,
|
||||
const SizeStatistics& size_stats) = 0;
|
||||
|
||||
/// \brief Complete the column index.
|
||||
///
|
||||
/// Once called, AddPage() can no longer be called.
|
||||
/// WriteTo() and Build() can only called after Finish() has been called.
|
||||
virtual void Finish() = 0;
|
||||
|
||||
/// \brief Serialize the column index thrift message.
|
||||
///
|
||||
/// If the ColumnIndexBuilder has seen any corrupted statistics, it will
|
||||
/// not write any data to the sink.
|
||||
///
|
||||
/// \param[out] sink output stream to write the serialized message.
|
||||
/// \param[in] encryptor encryptor to encrypt the serialized column index.
|
||||
virtual void WriteTo(::arrow::io::OutputStream* sink,
|
||||
Encryptor* encryptor = NULLPTR) const = 0;
|
||||
|
||||
/// \brief Create a ColumnIndex directly.
|
||||
///
|
||||
/// \return If the ColumnIndexBuilder has seen any corrupted statistics, it simply
|
||||
/// returns nullptr. Otherwise the column index is built and returned.
|
||||
virtual std::unique_ptr<ColumnIndex> Build() const = 0;
|
||||
};
|
||||
|
||||
/// \brief Interface for collecting offset index of data pages in a column chunk.
|
||||
class PARQUET_EXPORT OffsetIndexBuilder {
|
||||
public:
|
||||
/// \brief API convenience to create a OffsetIndexBuilder.
|
||||
static std::unique_ptr<OffsetIndexBuilder> Make();
|
||||
|
||||
virtual ~OffsetIndexBuilder() = default;
|
||||
|
||||
/// \brief Add page location and size stats of a data page.
|
||||
virtual void AddPage(int64_t offset, int32_t compressed_page_size,
|
||||
int64_t first_row_index,
|
||||
std::optional<int64_t> unencoded_byte_array_length = {}) = 0;
|
||||
|
||||
/// \brief Add page location and size stats of a data page.
|
||||
void AddPage(const PageLocation& page_location, const SizeStatistics& size_stats);
|
||||
|
||||
/// \brief Complete the offset index.
|
||||
///
|
||||
/// In the buffered row group mode, data pages are flushed into memory
|
||||
/// sink and the OffsetIndexBuilder has only collected the relative offset
|
||||
/// which requires adjustment once they are flushed to the file.
|
||||
///
|
||||
/// \param final_position Final stream offset to add for page offset adjustment.
|
||||
virtual void Finish(int64_t final_position) = 0;
|
||||
|
||||
/// \brief Serialize the offset index thrift message.
|
||||
///
|
||||
/// \param[out] sink output stream to write the serialized message.
|
||||
/// \param[in] encryptor encryptor to encrypt the serialized offset index.
|
||||
virtual void WriteTo(::arrow::io::OutputStream* sink,
|
||||
Encryptor* encryptor = NULLPTR) const = 0;
|
||||
|
||||
/// \brief Create an OffsetIndex directly.
|
||||
virtual std::unique_ptr<OffsetIndex> Build() const = 0;
|
||||
};
|
||||
|
||||
/// \brief Interface for collecting page index of a parquet file.
|
||||
class PARQUET_EXPORT PageIndexBuilder {
|
||||
public:
|
||||
/// \brief API convenience to create a PageIndexBuilder.
|
||||
static std::unique_ptr<PageIndexBuilder> Make(
|
||||
const SchemaDescriptor* schema, InternalFileEncryptor* file_encryptor = NULLPTR);
|
||||
|
||||
virtual ~PageIndexBuilder() = default;
|
||||
|
||||
/// \brief Start a new row group.
|
||||
virtual void AppendRowGroup() = 0;
|
||||
|
||||
/// \brief Get the ColumnIndexBuilder from column ordinal.
|
||||
///
|
||||
/// \param i Column ordinal.
|
||||
/// \return ColumnIndexBuilder for the column and its memory ownership belongs to
|
||||
/// the PageIndexBuilder.
|
||||
virtual ColumnIndexBuilder* GetColumnIndexBuilder(int32_t i) = 0;
|
||||
|
||||
/// \brief Get the OffsetIndexBuilder from column ordinal.
|
||||
///
|
||||
/// \param i Column ordinal.
|
||||
/// \return OffsetIndexBuilder for the column and its memory ownership belongs to
|
||||
/// the PageIndexBuilder.
|
||||
virtual OffsetIndexBuilder* GetOffsetIndexBuilder(int32_t i) = 0;
|
||||
|
||||
/// \brief Complete the page index builder and no more write is allowed.
|
||||
virtual void Finish() = 0;
|
||||
|
||||
/// \brief Serialize the page index thrift message.
|
||||
///
|
||||
/// Only valid column indexes and offset indexes are serialized and their locations
|
||||
/// are set.
|
||||
///
|
||||
/// \param[out] sink The output stream to write the page index.
|
||||
/// \param[out] location The location of all page index to the start of sink.
|
||||
virtual void WriteTo(::arrow::io::OutputStream* sink,
|
||||
PageIndexLocation* location) const = 0;
|
||||
};
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,31 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#ifndef PARQUET_VERSION_H
#define PARQUET_VERSION_H

// Numeric components of the library version (major.minor.patch).
#define PARQUET_VERSION_MAJOR 22
#define PARQUET_VERSION_MINOR 0
#define PARQUET_VERSION_PATCH 0

// Shared-object version strings (short and full form).
#define PARQUET_SO_VERSION "2200"
#define PARQUET_FULL_SO_VERSION "2200.0.0"

// The "created by" version string for parquet.
#define CREATED_BY_VERSION "parquet-cpp-arrow version 22.0.0"

#endif  // PARQUET_VERSION_H
|
||||
@@ -0,0 +1,112 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/buffer.h" // IWYU pragma: export
|
||||
#include "arrow/io/interfaces.h" // IWYU pragma: export
|
||||
#include "arrow/status.h" // IWYU pragma: export
|
||||
#include "arrow/type_fwd.h" // IWYU pragma: export
|
||||
#include "arrow/util/macros.h" // IWYU pragma: export
|
||||
|
||||
// Symbol export / visibility macros, selected per platform and compiler.
#if defined(_WIN32) || defined(__CYGWIN__)

#  if defined(_MSC_VER)
#    pragma warning(push)
// Disable warning for STL types usage in DLL interface
// https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports
#    pragma warning(disable : 4275 4251)
// Disable diamond inheritance warnings
#    pragma warning(disable : 4250)
// Disable macro redefinition warnings
#    pragma warning(disable : 4005)
// Disable extern before exported template warnings
#    pragma warning(disable : 4910)
#  else
#    pragma GCC diagnostic ignored "-Wattributes"
#  endif

// Static builds export nothing; otherwise export when building the library
// itself and import when consuming it.
#  if defined(PARQUET_STATIC)
#    define PARQUET_EXPORT
#  elif defined(PARQUET_EXPORTING)
#    define PARQUET_EXPORT __declspec(dllexport)
#  else
#    define PARQUET_EXPORT __declspec(dllimport)
#  endif

#  define PARQUET_NO_EXPORT

#else  // Not Windows
// Keep any definition supplied from outside; otherwise fall back to the
// visibility attributes.
#  if !defined(PARQUET_EXPORT)
#    define PARQUET_EXPORT __attribute__((visibility("default")))
#  endif
#  if !defined(PARQUET_NO_EXPORT)
#    define PARQUET_NO_EXPORT __attribute__((visibility("hidden")))
#  endif
#endif  // Not Windows

// Exporting templates is a complicated topic, some reading on it:
// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/
#if defined(_MSC_VER) || defined(__clang__)
#  define PARQUET_TEMPLATE_CLASS_EXPORT
#  define PARQUET_TEMPLATE_EXPORT PARQUET_EXPORT
#else
#  define PARQUET_TEMPLATE_CLASS_EXPORT PARQUET_EXPORT
#  define PARQUET_TEMPLATE_EXPORT
#endif

// Parquet-prefixed spellings of the corresponding Arrow macros.
#define PARQUET_DISALLOW_COPY_AND_ASSIGN ARROW_DISALLOW_COPY_AND_ASSIGN

#define PARQUET_NORETURN ARROW_NORETURN
#define PARQUET_DEPRECATED ARROW_DEPRECATED

// If ARROW_VALGRIND set when compiling unit tests, also define
// PARQUET_VALGRIND
#if defined(ARROW_VALGRIND)
#  define PARQUET_VALGRIND
#endif
|
||||
|
||||
namespace parquet {
|
||||
|
||||
// Parquet-namespace aliases for the Arrow types named on the right-hand side.
using Buffer = ::arrow::Buffer;
using Codec = ::arrow::util::Codec;
using CodecOptions = ::arrow::util::CodecOptions;
using Compression = ::arrow::Compression;
using MemoryPool = ::arrow::MemoryPool;
using MutableBuffer = ::arrow::MutableBuffer;
using ResizableBuffer = ::arrow::ResizableBuffer;
using ArrowInputFile = ::arrow::io::RandomAccessFile;
using ArrowInputStream = ::arrow::io::InputStream;
using ArrowOutputStream = ::arrow::io::OutputStream;
|
||||
|
||||
// Default output stream size.
// NOTE(review): unit (presumably bytes) is not visible here - confirm at the use sites.
constexpr int64_t kDefaultOutputStreamSize{1024};

// Ordinal value -1.
// NOTE(review): presumably a sentinel for "not a page" - confirm at the use sites.
constexpr int16_t kNonPageOrdinal{-1};
|
||||
|
||||
/// \brief Create a BufferOutputStream.
///
/// \param pool memory pool argument; defaults to the Arrow default memory pool.
PARQUET_EXPORT std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(
    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());

/// \brief Allocate a ResizableBuffer.
///
/// \param pool memory pool argument; defaults to the Arrow default memory pool.
/// \param size requested size, 0 by default.
PARQUET_EXPORT std::shared_ptr<ResizableBuffer> AllocateBuffer(
    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0);
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,46 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iosfwd>
|
||||
#include <list>
|
||||
|
||||
#include "parquet/platform.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class ParquetFileReader;
|
||||
|
||||
class PARQUET_EXPORT ParquetFilePrinter {
|
||||
private:
|
||||
ParquetFileReader* fileReader;
|
||||
|
||||
public:
|
||||
explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
|
||||
~ParquetFilePrinter() = default;
|
||||
|
||||
void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
|
||||
bool print_values = false, bool format_dump = false,
|
||||
bool print_key_value_metadata = false,
|
||||
const char* filename = "No Name");
|
||||
|
||||
void JSONPrint(std::ostream& stream, std::list<int> selected_columns,
|
||||
const char* filename = "No Name");
|
||||
};
|
||||
|
||||
} // namespace parquet
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,494 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This module contains the logical parquet-cpp types (independent of Thrift
|
||||
// structures), schema nodes, and related type tools
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/types.h"
|
||||
#include "parquet/windows_fixup.h" // for OPTIONAL
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class SchemaDescriptor;
|
||||
|
||||
namespace schema {
|
||||
|
||||
class Node;
|
||||
|
||||
// List encodings: using the terminology from Impala to define different styles
|
||||
// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
|
||||
// the converted type named in the Parquet metadata is ConvertedType::LIST we
|
||||
// use that terminology here. It also helps distinguish from the *_ARRAY
|
||||
// primitive types.
|
||||
//
|
||||
// One-level encoding: Only allows required lists with required cells
|
||||
// repeated value_type name
|
||||
//
|
||||
// Two-level encoding: Enables optional lists with only required cells
|
||||
// <required/optional> group list
|
||||
// repeated value_type item
|
||||
//
|
||||
// Three-level encoding: Enables optional lists with optional cells
|
||||
// <required/optional> group bag
|
||||
// repeated group list
|
||||
// <required/optional> value_type item
|
||||
//
|
||||
// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
|
||||
// the non-repeated nodes set to required.
|
||||
//
|
||||
// The "official" encoding recommended in the Parquet spec is the 3-level, and
|
||||
// we use that as the default when creating list types. For semantic completeness
|
||||
// we allow the other two. Since all types of encodings will occur "in the
|
||||
// wild" we need to be able to interpret the associated definition levels in
|
||||
// the context of the actual encoding used in the file.
|
||||
//
|
||||
// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
|
||||
// SchemaElement, which could make things challenging if we are trying to infer
|
||||
// that a sequence of nodes semantically represents an array according to one
|
||||
// of these encodings (versus a struct containing an array). We should refuse
|
||||
// the temptation to guess, as they say.
|
||||
// Selects one of the list encodings described above.
struct ListEncoding {
  enum type {
    ONE_LEVEL,
    TWO_LEVEL,
    THREE_LEVEL
  };
};
|
||||
|
||||
class PARQUET_EXPORT ColumnPath {
|
||||
public:
|
||||
ColumnPath() : path_() {}
|
||||
explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
|
||||
explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
|
||||
|
||||
static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
|
||||
static std::shared_ptr<ColumnPath> FromNode(const Node& node);
|
||||
|
||||
std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
|
||||
std::string ToDotString() const;
|
||||
const std::vector<std::string>& ToDotVector() const;
|
||||
|
||||
protected:
|
||||
std::vector<std::string> path_;
|
||||
};
|
||||
|
||||
// Base class for logical schema types. A type has a name, repetition level,
|
||||
// and optionally a logical type (ConvertedType in Parquet metadata parlance)
|
||||
class PARQUET_EXPORT Node {
|
||||
public:
|
||||
enum type { PRIMITIVE, GROUP };
|
||||
|
||||
virtual ~Node() {}
|
||||
|
||||
bool is_primitive() const { return type_ == Node::PRIMITIVE; }
|
||||
|
||||
bool is_group() const { return type_ == Node::GROUP; }
|
||||
|
||||
bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
|
||||
|
||||
bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
|
||||
|
||||
bool is_required() const { return repetition_ == Repetition::REQUIRED; }
|
||||
|
||||
virtual bool Equals(const Node* other) const = 0;
|
||||
|
||||
const std::string& name() const { return name_; }
|
||||
|
||||
Node::type node_type() const { return type_; }
|
||||
|
||||
Repetition::type repetition() const { return repetition_; }
|
||||
|
||||
ConvertedType::type converted_type() const { return converted_type_; }
|
||||
|
||||
const std::shared_ptr<const LogicalType>& logical_type() const { return logical_type_; }
|
||||
|
||||
/// \brief The field_id value for the serialized SchemaElement. If the
|
||||
/// field_id is less than 0 (e.g. -1), it will not be set when serialized to
|
||||
/// Thrift.
|
||||
int field_id() const { return field_id_; }
|
||||
|
||||
const Node* parent() const { return parent_; }
|
||||
|
||||
const std::shared_ptr<ColumnPath> path() const;
|
||||
|
||||
virtual void ToParquet(void* element) const = 0;
|
||||
|
||||
// Node::Visitor abstract class for walking schemas with the visitor pattern
|
||||
class Visitor {
|
||||
public:
|
||||
virtual ~Visitor() {}
|
||||
|
||||
virtual void Visit(Node* node) = 0;
|
||||
};
|
||||
class ConstVisitor {
|
||||
public:
|
||||
virtual ~ConstVisitor() {}
|
||||
|
||||
virtual void Visit(const Node* node) = 0;
|
||||
};
|
||||
|
||||
virtual void Visit(Visitor* visitor) = 0;
|
||||
virtual void VisitConst(ConstVisitor* visitor) const = 0;
|
||||
|
||||
protected:
|
||||
friend class GroupNode;
|
||||
|
||||
Node(Node::type type, const std::string& name, Repetition::type repetition,
|
||||
ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1)
|
||||
: type_(type),
|
||||
name_(name),
|
||||
repetition_(repetition),
|
||||
converted_type_(converted_type),
|
||||
field_id_(field_id),
|
||||
parent_(NULLPTR) {}
|
||||
|
||||
Node(Node::type type, const std::string& name, Repetition::type repetition,
|
||||
std::shared_ptr<const LogicalType> logical_type, int field_id = -1)
|
||||
: type_(type),
|
||||
name_(name),
|
||||
repetition_(repetition),
|
||||
logical_type_(std::move(logical_type)),
|
||||
field_id_(field_id),
|
||||
parent_(NULLPTR) {}
|
||||
|
||||
Node::type type_;
|
||||
std::string name_;
|
||||
Repetition::type repetition_;
|
||||
ConvertedType::type converted_type_{ConvertedType::NONE};
|
||||
std::shared_ptr<const LogicalType> logical_type_;
|
||||
int field_id_;
|
||||
// Nodes should not be shared, they have a single parent.
|
||||
const Node* parent_;
|
||||
|
||||
bool EqualsInternal(const Node* other) const;
|
||||
void SetParent(const Node* p_parent);
|
||||
|
||||
private:
|
||||
PARQUET_DISALLOW_COPY_AND_ASSIGN(Node);
|
||||
};
|
||||
|
||||
// Save our breath all over the place with these typedefs
|
||||
using NodePtr = std::shared_ptr<Node>;
|
||||
using NodeVector = std::vector<NodePtr>;
|
||||
|
||||
// A type that is one of the primitive Parquet storage types. In addition to
|
||||
// the other type metadata (name, repetition level, logical type), also has the
|
||||
// physical storage type and their type-specific metadata (byte width, decimal
|
||||
// parameters)
|
||||
class PARQUET_EXPORT PrimitiveNode : public Node {
 public:
  static std::unique_ptr<Node> FromParquet(const void* opaque_element);

  // Factory taking a converted type.
  // A field_id -1 (or any negative value) will be serialized as null in Thrift
  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
                             Type::type type,
                             ConvertedType::type converted_type = ConvertedType::NONE,
                             int length = -1, int precision = -1, int scale = -1,
                             int field_id = -1) {
    NodePtr made(new PrimitiveNode(name, repetition, type, converted_type, length,
                                   precision, scale, field_id));
    return made;
  }

  // Factory taking a logical type.
  // If no logical type, pass LogicalType::None() or nullptr
  // A field_id -1 (or any negative value) will be serialized as null in Thrift
  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
                             std::shared_ptr<const LogicalType> logical_type,
                             Type::type primitive_type, int primitive_length = -1,
                             int field_id = -1) {
    NodePtr made(new PrimitiveNode(name, repetition, std::move(logical_type),
                                   primitive_type, primitive_length, field_id));
    return made;
  }

  bool Equals(const Node* other) const override;

  // Accessors for the physical type, column order, type length and decimal
  // metadata stored on this node.
  Type::type physical_type() const { return physical_type_; }

  ColumnOrder column_order() const { return column_order_; }

  void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }

  int32_t type_length() const { return type_length_; }

  const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }

  void ToParquet(void* element) const override;
  void Visit(Visitor* visitor) override;
  void VisitConst(ConstVisitor* visitor) const override;

 private:
  // Constructors are private; instances are created through Make().
  PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
                ConvertedType::type converted_type = ConvertedType::NONE,
                int length = -1, int precision = -1, int scale = -1,
                int field_id = -1);

  PrimitiveNode(const std::string& name, Repetition::type repetition,
                std::shared_ptr<const LogicalType> logical_type,
                Type::type primitive_type, int primitive_length = -1,
                int field_id = -1);

  Type::type physical_type_;
  int32_t type_length_;
  DecimalMetadata decimal_metadata_;
  ColumnOrder column_order_;

  // For FIXED_LEN_BYTE_ARRAY
  void SetTypeLength(int32_t length) { type_length_ = length; }

  bool EqualsInternal(const PrimitiveNode* other) const;

  // Unit tests that are granted access to the private members.
  FRIEND_TEST(TestPrimitiveNode, Attrs);
  FRIEND_TEST(TestPrimitiveNode, Equals);
  FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
  FRIEND_TEST(TestPrimitiveNode, FromParquet);
};
|
||||
|
||||
// A schema node that holds a vector of child nodes (its fields).
class PARQUET_EXPORT GroupNode : public Node {
 public:
  static std::unique_ptr<Node> FromParquet(const void* opaque_element,
                                           NodeVector fields = {});

  // Factory taking a converted type.
  // A field_id -1 (or any negative value) will be serialized as null in Thrift
  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
                             const NodeVector& fields,
                             ConvertedType::type converted_type = ConvertedType::NONE,
                             int field_id = -1) {
    NodePtr made(new GroupNode(name, repetition, fields, converted_type, field_id));
    return made;
  }

  // Factory taking a logical type.
  // If no logical type, pass nullptr
  // A field_id -1 (or any negative value) will be serialized as null in Thrift
  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
                             const NodeVector& fields,
                             std::shared_ptr<const LogicalType> logical_type,
                             int field_id = -1) {
    NodePtr made(
        new GroupNode(name, repetition, fields, std::move(logical_type), field_id));
    return made;
  }

  bool Equals(const Node* other) const override;

  // The i-th field; the index is not range-checked here.
  const NodePtr& field(int i) const { return fields_[i]; }

  // Get the index of a field by its name, or negative value if not found.
  // If several fields share the same name, it is unspecified which one
  // is returned.
  int FieldIndex(const std::string& name) const;

  // Get the index of a field by its node, or negative value if not found.
  int FieldIndex(const Node& node) const;

  // Number of fields, converted to int.
  int field_count() const {
    const auto count = fields_.size();
    return static_cast<int>(count);
  }

  void ToParquet(void* element) const override;
  void Visit(Visitor* visitor) override;
  void VisitConst(ConstVisitor* visitor) const override;

  /// \brief Return true if this node or any child node has REPEATED repetition
  /// type
  bool HasRepeatedFields() const;

 private:
  // Constructors are private; instances are created through Make().
  GroupNode(const std::string& name, Repetition::type repetition,
            const NodeVector& fields,
            ConvertedType::type converted_type = ConvertedType::NONE,
            int field_id = -1);

  GroupNode(const std::string& name, Repetition::type repetition,
            const NodeVector& fields, std::shared_ptr<const LogicalType> logical_type,
            int field_id = -1);

  NodeVector fields_;
  bool EqualsInternal(const GroupNode* other) const;

  // Mapping between field name to the field index
  std::unordered_multimap<std::string, int> field_name_to_idx_;

  // Unit tests that are granted access to the private members.
  FRIEND_TEST(TestGroupNode, Attrs);
  FRIEND_TEST(TestGroupNode, Equals);
  FRIEND_TEST(TestGroupNode, FieldIndex);
  FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
};
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Convenience primitive type factory functions
|
||||
|
||||
#define PRIMITIVE_FACTORY(FuncName, TYPE) \
|
||||
static inline NodePtr FuncName(const std::string& name, \
|
||||
Repetition::type repetition = Repetition::OPTIONAL, \
|
||||
int field_id = -1) { \
|
||||
return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE, \
|
||||
/*length=*/-1, /*precision=*/-1, /*scale=*/-1, field_id); \
|
||||
}
|
||||
|
||||
PRIMITIVE_FACTORY(Boolean, BOOLEAN)
|
||||
PRIMITIVE_FACTORY(Int32, INT32)
|
||||
PRIMITIVE_FACTORY(Int64, INT64)
|
||||
PRIMITIVE_FACTORY(Int96, INT96)
|
||||
PRIMITIVE_FACTORY(Float, FLOAT)
|
||||
PRIMITIVE_FACTORY(Double, DOUBLE)
|
||||
PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY)
|
||||
|
||||
void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream,
|
||||
int indent_width = 2);
|
||||
|
||||
} // namespace schema
|
||||
|
||||
// The ColumnDescriptor encapsulates information necessary to interpret
|
||||
// primitive column data in the context of a particular schema. We have to
|
||||
// examine the node structure of a column's path to the root in the schema tree
|
||||
// to be able to reassemble the nested structure from the repetition and
|
||||
// definition levels.
|
||||
class PARQUET_EXPORT ColumnDescriptor {
|
||||
public:
|
||||
ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
|
||||
int16_t max_repetition_level,
|
||||
const SchemaDescriptor* schema_descr = NULLPTR);
|
||||
|
||||
bool Equals(const ColumnDescriptor& other) const;
|
||||
|
||||
int16_t max_definition_level() const { return max_definition_level_; }
|
||||
|
||||
int16_t max_repetition_level() const { return max_repetition_level_; }
|
||||
|
||||
Type::type physical_type() const { return primitive_node_->physical_type(); }
|
||||
|
||||
ConvertedType::type converted_type() const { return primitive_node_->converted_type(); }
|
||||
|
||||
const std::shared_ptr<const LogicalType>& logical_type() const {
|
||||
return primitive_node_->logical_type();
|
||||
}
|
||||
|
||||
ColumnOrder column_order() const { return primitive_node_->column_order(); }
|
||||
|
||||
SortOrder::type sort_order() const {
|
||||
const auto& la = logical_type();
|
||||
auto pt = physical_type();
|
||||
return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
|
||||
}
|
||||
|
||||
const std::string& name() const { return primitive_node_->name(); }
|
||||
|
||||
const std::shared_ptr<schema::ColumnPath> path() const;
|
||||
|
||||
const schema::NodePtr& schema_node() const { return node_; }
|
||||
|
||||
std::string ToString() const;
|
||||
|
||||
int type_length() const;
|
||||
|
||||
int type_precision() const;
|
||||
|
||||
int type_scale() const;
|
||||
|
||||
private:
|
||||
schema::NodePtr node_;
|
||||
const schema::PrimitiveNode* primitive_node_;
|
||||
|
||||
int16_t max_definition_level_;
|
||||
int16_t max_repetition_level_;
|
||||
};
|
||||
|
||||
// Container for the converted Parquet schema with computed information from
|
||||
// the schema analysis needed for file reading
|
||||
//
|
||||
// * Column index to Node
|
||||
// * Max repetition / definition levels for each primitive node
|
||||
//
|
||||
// The ColumnDescriptor objects produced by this class can be used to assist in
|
||||
// the reconstruction of fully materialized data structures from the
|
||||
// repetition-definition level encoding of nested data
|
||||
//
|
||||
// TODO(wesm): this object can be recomputed from a Schema
|
||||
class PARQUET_EXPORT SchemaDescriptor {
|
||||
public:
|
||||
SchemaDescriptor() = default;
|
||||
~SchemaDescriptor() = default;
|
||||
|
||||
// Analyze the schema
|
||||
void Init(std::unique_ptr<schema::Node> schema);
|
||||
void Init(schema::NodePtr schema);
|
||||
|
||||
const ColumnDescriptor* Column(int i) const;
|
||||
|
||||
// Get the index of a column by its dotstring path, or negative value if not found.
|
||||
// If several columns share the same dotstring path, it is unspecified which one
|
||||
// is returned.
|
||||
int ColumnIndex(const std::string& node_path) const;
|
||||
// Get the index of a column by its node, or negative value if not found.
|
||||
int ColumnIndex(const schema::Node& node) const;
|
||||
|
||||
bool Equals(const SchemaDescriptor& other, std::ostream* diff_output = NULLPTR) const;
|
||||
|
||||
// The number of physical columns appearing in the file
|
||||
int num_columns() const { return static_cast<int>(leaves_.size()); }
|
||||
|
||||
const schema::NodePtr& schema_root() const { return schema_; }
|
||||
|
||||
const schema::GroupNode* group_node() const { return group_node_; }
|
||||
|
||||
// Returns the root (child of the schema root) node of the leaf(column) node
|
||||
const schema::Node* GetColumnRoot(int i) const;
|
||||
|
||||
const std::string& name() const { return group_node_->name(); }
|
||||
|
||||
std::string ToString() const;
|
||||
|
||||
void updateColumnOrders(const std::vector<ColumnOrder>& column_orders);
|
||||
|
||||
/// \brief Return column index corresponding to a particular
|
||||
/// PrimitiveNode. Returns -1 if not found
|
||||
int GetColumnIndex(const schema::PrimitiveNode& node) const;
|
||||
|
||||
/// \brief Return true if any field or their children have REPEATED repetition
|
||||
/// type
|
||||
bool HasRepeatedFields() const;
|
||||
|
||||
private:
|
||||
friend class ColumnDescriptor;
|
||||
|
||||
// Root Node
|
||||
schema::NodePtr schema_;
|
||||
// Root Node
|
||||
// Would never be NULLPTR.
|
||||
const schema::GroupNode* group_node_;
|
||||
|
||||
void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
|
||||
int16_t max_rep_level, const schema::NodePtr& base);
|
||||
|
||||
// Result of leaf node / tree analysis
|
||||
std::vector<ColumnDescriptor> leaves_;
|
||||
|
||||
std::unordered_map<const schema::PrimitiveNode*, int> node_to_leaf_index_;
|
||||
|
||||
// Mapping between leaf nodes and root group of leaf (first node
|
||||
// below the schema's root group)
|
||||
//
|
||||
// For example, the leaf `a.b.c.d` would have a link back to `a`
|
||||
//
|
||||
// -- a <------
|
||||
// -- -- b |
|
||||
// -- -- -- c |
|
||||
// -- -- -- -- d
|
||||
std::unordered_map<int, schema::NodePtr> leaf_to_base_;
|
||||
|
||||
// Mapping between ColumnPath DotString to the leaf index
|
||||
std::unordered_multimap<std::string, int> leaf_to_idx_;
|
||||
};
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,102 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <iosfwd>
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/util/span.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/type_fwd.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
/// A structure for capturing metadata for estimating the unencoded,
|
||||
/// uncompressed size of data written. This is useful for readers to estimate
|
||||
/// how much memory is needed to reconstruct data in their memory model and for
|
||||
/// fine-grained filter push down on nested structures (the histograms contained
|
||||
/// in this structure can help determine the number of nulls at a particular
|
||||
/// nesting level and maximum length of lists).
|
||||
struct PARQUET_EXPORT SizeStatistics {
|
||||
/// When present, there is expected to be one element corresponding to each
|
||||
/// definition (i.e. size=max definition+1) where each element
|
||||
/// represents the number of times the definition level was observed in the
|
||||
/// data.
|
||||
///
|
||||
/// This field may be omitted (a.k.a. zero-length vector) if max_definition_level
|
||||
/// is 0 without loss of information.
|
||||
std::vector<int64_t> definition_level_histogram;
|
||||
|
||||
/// Same as definition_level_histogram except for repetition levels.
|
||||
///
|
||||
/// This field may be omitted (a.k.a. zero-length vector) if max_repetition_level
|
||||
/// is 0 without loss of information.
|
||||
std::vector<int64_t> repetition_level_histogram;
|
||||
|
||||
/// The number of physical bytes stored for BYTE_ARRAY data values assuming
|
||||
/// no encoding. This is exclusive of the bytes needed to store the length of
|
||||
/// each byte array. In other words, this field is equivalent to the `(size
|
||||
/// of PLAIN-ENCODING the byte array values) - (4 bytes * number of values
|
||||
/// written)`. To determine unencoded sizes of other types readers can use
|
||||
/// schema information multiplied by the number of non-null and null values.
|
||||
/// The number of null/non-null values can be inferred from the histograms
|
||||
/// below.
|
||||
///
|
||||
/// For example, if a column chunk is dictionary-encoded with dictionary
|
||||
/// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2],
|
||||
/// then this value for that data page should be 7 (1 + 1 + 2 + 3).
|
||||
///
|
||||
/// This field should only be set for types that use BYTE_ARRAY as their
|
||||
/// physical type.
|
||||
std::optional<int64_t> unencoded_byte_array_data_bytes;
|
||||
|
||||
/// \brief Check if the SizeStatistics is set.
|
||||
bool is_set() const {
|
||||
return !repetition_level_histogram.empty() || !definition_level_histogram.empty() ||
|
||||
unencoded_byte_array_data_bytes.has_value();
|
||||
}
|
||||
|
||||
/// \brief Increment the unencoded byte array data bytes.
|
||||
void IncrementUnencodedByteArrayDataBytes(int64_t value);
|
||||
|
||||
/// \brief Merge two SizeStatistics.
|
||||
/// \throws ParquetException if SizeStatistics to merge is not compatible.
|
||||
void Merge(const SizeStatistics& other);
|
||||
|
||||
/// \brief Validate the SizeStatistics
|
||||
/// \throws ParquetException if the histograms don't have the right length,
|
||||
/// or if unencoded_byte_array_data_bytes is present for a non-BYTE_ARRAY column.
|
||||
void Validate(const ColumnDescriptor* descr) const;
|
||||
|
||||
/// \brief Reset the SizeStatistics to be empty.
|
||||
void Reset();
|
||||
|
||||
/// \brief Make an empty SizeStatistics object for specific type.
|
||||
static std::unique_ptr<SizeStatistics> Make(const ColumnDescriptor* descr);
|
||||
};
|
||||
|
||||
/// \brief Stream insertion operator for SizeStatistics.
PARQUET_EXPORT
|
||||
std::ostream& operator<<(std::ostream&, const SizeStatistics&);
|
||||
|
||||
/// \brief Update a level histogram from a span of levels.
/// NOTE(review): presumably increments histogram[level] for every entry of
/// `levels`; the definition is not visible in this header -- confirm.
PARQUET_EXPORT
|
||||
void UpdateLevelHistogram(::arrow::util::span<const int16_t> levels,
|
||||
::arrow::util::span<int64_t> histogram);
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,441 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/types.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
class BinaryArray;
|
||||
|
||||
} // namespace arrow
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class ColumnDescriptor;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Value comparator interfaces
|
||||
|
||||
/// \brief Base class for value comparators. Generally used with
|
||||
/// TypedComparator<T>
|
||||
class PARQUET_EXPORT Comparator {
|
||||
public:
|
||||
virtual ~Comparator() {}
|
||||
|
||||
/// \brief Create a comparator explicitly from physical type and
|
||||
/// sort order
|
||||
/// \param[in] physical_type the physical type for the typed
|
||||
/// comparator
|
||||
/// \param[in] sort_order either SortOrder::SIGNED or
|
||||
/// SortOrder::UNSIGNED
|
||||
/// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only
|
||||
static std::shared_ptr<Comparator> Make(Type::type physical_type,
|
||||
SortOrder::type sort_order,
|
||||
int type_length = -1);
|
||||
|
||||
/// \brief Create typed comparator inferring default sort order from
|
||||
/// ColumnDescriptor
|
||||
/// \param[in] descr the Parquet column schema
|
||||
static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
|
||||
};
|
||||
|
||||
/// \brief Interface for comparison of physical types according to the
|
||||
/// semantics of a particular logical type.
|
||||
template <typename DType>
|
||||
class TypedComparator : public Comparator {
|
||||
public:
|
||||
using T = typename DType::c_type;
|
||||
|
||||
/// \brief Scalar comparison of two elements, return true if first
|
||||
/// is strictly less than the second
|
||||
virtual bool Compare(const T& a, const T& b) const = 0;
|
||||
|
||||
/// \brief Compute maximum and minimum elements in a batch of
|
||||
/// elements without any nulls
|
||||
virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) const = 0;
|
||||
|
||||
/// \brief Compute minimum and maximum elements from an Arrow array. Only
|
||||
/// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
|
||||
/// / arrow::BinaryArray
|
||||
virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) const = 0;
|
||||
|
||||
/// \brief Compute maximum and minimum elements in a batch of
|
||||
/// elements with accompanying bitmap indicating which elements are
|
||||
/// included (bit set) and excluded (bit not set)
|
||||
///
|
||||
/// \param[in] values the sequence of values
|
||||
/// \param[in] length the length of the sequence
|
||||
/// \param[in] valid_bits a bitmap indicating which elements are
|
||||
/// included (1) or excluded (0)
|
||||
/// \param[in] valid_bits_offset the bit offset into the bitmap of
|
||||
/// the first element in the sequence
|
||||
virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
|
||||
const uint8_t* valid_bits,
|
||||
int64_t valid_bits_offset) const = 0;
|
||||
};
|
||||
|
||||
/// \brief Typed version of Comparator::Make
|
||||
template <typename DType>
|
||||
std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
|
||||
SortOrder::type sort_order,
|
||||
int type_length = -1) {
|
||||
return std::static_pointer_cast<TypedComparator<DType>>(
|
||||
Comparator::Make(physical_type, sort_order, type_length));
|
||||
}
|
||||
|
||||
/// \brief Typed version of Comparator::Make
|
||||
template <typename DType>
|
||||
std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
|
||||
return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
/// \brief Structure representing encoded statistics to be written to
/// and read from Parquet serialized metadata.
class PARQUET_EXPORT EncodedStatistics {
  std::string max_;
  std::string min_;
  bool is_signed_ = false;

 public:
  EncodedStatistics() = default;

  /// Encoded maximum value; only meaningful while has_max is true.
  const std::string& max() const { return max_; }
  /// Encoded minimum value; only meaningful while has_min is true.
  const std::string& min() const { return min_; }

  /// Exactness flags for the max / min values; unset when unknown or when the
  /// corresponding value has been dropped.
  std::optional<bool> is_max_value_exact;
  std::optional<bool> is_min_value_exact;

  int64_t null_count = 0;
  int64_t distinct_count = 0;

  bool has_min = false;
  bool has_max = false;
  bool has_null_count = false;
  bool has_distinct_count = false;

  // When all values in the statistics are null, it is set to true.
  // Otherwise, at least one value is not null, or we are not sure at all.
  // Page index requires this information to decide whether a data page
  // is a null page or not.
  bool all_null_value = false;

  // From parquet-mr
  // Don't write stats larger than the max size rather than truncating. The
  // rationale is that some engines may use the minimum value in the page as
  // the true minimum for aggregations and there is no way to mark that a
  // value has been truncated and is a lower bound and not in the page.
  /// \brief Drop the min and/or max value if its encoded form is longer than
  /// `length` bytes.
  void ApplyStatSizeLimits(size_t length) {
    if (max_.length() > length) {
      ResetMax();
    }
    if (min_.length() > length) {
      ResetMin();
    }
  }

  /// \brief Clear both min and max, including their exactness flags.
  ///
  /// The exactness flags describe the min/max values, so they are reset
  /// together with them (as ApplyStatSizeLimits does) instead of being left
  /// stale.
  void ClearMinMax() {
    ResetMax();
    ResetMin();
  }

  /// \brief Return true if any of min, max, null count or distinct count is set.
  bool is_set() const {
    return has_min || has_max || has_null_count || has_distinct_count;
  }

  bool is_signed() const { return is_signed_; }

  void set_is_signed(bool is_signed) { is_signed_ = is_signed; }

  /// \brief Store the encoded maximum and mark it as present.
  EncodedStatistics& set_max(std::string value) {
    max_ = std::move(value);
    has_max = true;
    return *this;
  }

  /// \brief Store the encoded minimum and mark it as present.
  EncodedStatistics& set_min(std::string value) {
    min_ = std::move(value);
    has_min = true;
    return *this;
  }

  /// \brief Store the null count and mark it as present.
  EncodedStatistics& set_null_count(int64_t value) {
    null_count = value;
    has_null_count = true;
    return *this;
  }

  /// \brief Store the distinct count and mark it as present.
  EncodedStatistics& set_distinct_count(int64_t value) {
    distinct_count = value;
    has_distinct_count = true;
    return *this;
  }

 private:
  // Forget the max value together with everything that describes it.
  void ResetMax() {
    has_max = false;
    max_.clear();
    is_max_value_exact = std::nullopt;
  }

  // Forget the min value together with everything that describes it.
  void ResetMin() {
    has_min = false;
    min_.clear();
    is_min_value_exact = std::nullopt;
  }
};
|
||||
|
||||
/// \brief Base type for computing column statistics while writing a file
|
||||
class PARQUET_EXPORT Statistics {
|
||||
public:
|
||||
virtual ~Statistics() {}
|
||||
|
||||
/// \brief Create a new statistics instance given a column schema
|
||||
/// definition
|
||||
/// \param[in] descr the column schema
|
||||
/// \param[in] pool a memory pool to use for any memory allocations, optional
|
||||
static std::shared_ptr<Statistics> Make(
|
||||
const ColumnDescriptor* descr,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
|
||||
|
||||
/// \brief Create a new statistics instance given a column schema
|
||||
/// definition and preexisting state
|
||||
/// \param[in] descr the column schema
|
||||
/// \param[in] encoded_min the encoded minimum value
|
||||
/// \param[in] encoded_max the encoded maximum value
|
||||
/// \param[in] num_values total number of values
|
||||
/// \param[in] null_count number of null values
|
||||
/// \param[in] distinct_count number of distinct values
|
||||
/// \param[in] has_min_max whether the min/max statistics are set
|
||||
/// \param[in] has_null_count whether the null_count statistics are set
|
||||
/// \param[in] has_distinct_count whether the distinct_count statistics are set
|
||||
/// \param[in] pool a memory pool to use for any memory allocations, optional
|
||||
static std::shared_ptr<Statistics> Make(
|
||||
const ColumnDescriptor* descr, const std::string& encoded_min,
|
||||
const std::string& encoded_max, int64_t num_values, int64_t null_count,
|
||||
int64_t distinct_count, bool has_min_max, bool has_null_count,
|
||||
bool has_distinct_count,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
|
||||
|
||||
/// \brief Create a new statistics instance given a column schema
|
||||
/// definition and preexisting state
|
||||
/// \param[in] descr the column schema
|
||||
/// \param[in] encoded_min the encoded minimum value
|
||||
/// \param[in] encoded_max the encoded maximum value
|
||||
/// \param[in] num_values total number of values
|
||||
/// \param[in] null_count number of null values
|
||||
/// \param[in] distinct_count number of distinct values
|
||||
/// \param[in] has_min_max whether the min/max statistics are set
|
||||
/// \param[in] has_null_count whether the null_count statistics are set
|
||||
/// \param[in] has_distinct_count whether the distinct_count statistics are set
|
||||
/// \param[in] is_min_value_exact whether the min value is exact
|
||||
/// \param[in] is_max_value_exact whether the max value is exact
|
||||
/// \param[in] pool a memory pool to use for any memory allocations, optional
|
||||
static std::shared_ptr<Statistics> Make(
|
||||
const ColumnDescriptor* descr, const std::string& encoded_min,
|
||||
const std::string& encoded_max, int64_t num_values, int64_t null_count,
|
||||
int64_t distinct_count, bool has_min_max, bool has_null_count,
|
||||
bool has_distinct_count, std::optional<bool> is_min_value_exact,
|
||||
std::optional<bool> is_max_value_exact,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
|
||||
|
||||
// Helper function to convert EncodedStatistics to Statistics.
|
||||
// EncodedStatistics does not contain number of non-null values, and it can be
|
||||
// passed using the num_values parameter.
|
||||
static std::shared_ptr<Statistics> Make(
|
||||
const ColumnDescriptor* descr, const EncodedStatistics* encoded_statistics,
|
||||
int64_t num_values = -1,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
|
||||
|
||||
/// \brief Return true if the count of null values is set
|
||||
virtual bool HasNullCount() const = 0;
|
||||
|
||||
/// \brief The number of null values, may not be set
|
||||
virtual int64_t null_count() const = 0;
|
||||
|
||||
/// \brief Return true if the count of distinct values is set
|
||||
virtual bool HasDistinctCount() const = 0;
|
||||
|
||||
/// \brief The number of distinct values, may not be set
|
||||
virtual int64_t distinct_count() const = 0;
|
||||
|
||||
/// \brief The number of non-null values in the column
|
||||
virtual int64_t num_values() const = 0;
|
||||
|
||||
/// \brief Return true if both min and max statistics are set. Obtain
|
||||
/// with TypedStatistics<T>::min and max
|
||||
virtual bool HasMinMax() const = 0;
|
||||
|
||||
/// \brief Reset state of object to initial (no data observed) state
|
||||
virtual void Reset() = 0;
|
||||
|
||||
/// \brief Plain-encoded minimum value
|
||||
virtual std::string EncodeMin() const = 0;
|
||||
|
||||
/// \brief Plain-encoded maximum value
|
||||
virtual std::string EncodeMax() const = 0;
|
||||
|
||||
/// \brief Return the minimum value exact flag if set.
|
||||
/// It will be true if there was no truncation.
|
||||
virtual std::optional<bool> is_min_value_exact() const = 0;
|
||||
|
||||
/// \brief Return the maximum value exact flag if set.
|
||||
/// It will be true if there was no truncation.
|
||||
virtual std::optional<bool> is_max_value_exact() const = 0;
|
||||
|
||||
/// \brief The finalized encoded form of the statistics for transport
|
||||
virtual EncodedStatistics Encode() = 0;
|
||||
|
||||
/// \brief The physical type of the column schema
|
||||
virtual Type::type physical_type() const = 0;
|
||||
|
||||
/// \brief The full type descriptor from the column schema
|
||||
virtual const ColumnDescriptor* descr() const = 0;
|
||||
|
||||
/// \brief Check two Statistics for equality
|
||||
virtual bool Equals(const Statistics& other) const = 0;
|
||||
|
||||
protected:
|
||||
static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
|
||||
const void* max, int64_t num_values,
|
||||
int64_t null_count, int64_t distinct_count);
|
||||
};
|
||||
|
||||
/// \brief A typed implementation of Statistics
|
||||
template <typename DType>
|
||||
class TypedStatistics : public Statistics {
|
||||
public:
|
||||
using T = typename DType::c_type;
|
||||
|
||||
/// \brief The current minimum value
|
||||
virtual const T& min() const = 0;
|
||||
|
||||
/// \brief The current maximum value
|
||||
virtual const T& max() const = 0;
|
||||
|
||||
/// \brief Update state with state of another Statistics object
|
||||
virtual void Merge(const TypedStatistics<DType>& other) = 0;
|
||||
|
||||
/// \brief Batch statistics update
|
||||
virtual void Update(const T* values, int64_t num_values, int64_t null_count) = 0;
|
||||
|
||||
/// \brief Batch statistics update with supplied validity bitmap
|
||||
/// \param[in] values pointer to column values
|
||||
/// \param[in] valid_bits Pointer to bitmap representing if values are non-null.
|
||||
/// \param[in] valid_bits_offset Offset offset into valid_bits where the slice of
|
||||
/// data begins.
|
||||
/// \param[in] num_spaced_values The length of values in values/valid_bits to inspect
|
||||
/// when calculating statistics. This can be smaller than
|
||||
/// num_values+null_count as null_count can include nulls
|
||||
/// from parents while num_spaced_values does not.
|
||||
/// \param[in] num_values Number of values that are not null.
|
||||
/// \param[in] null_count Number of values that are null.
|
||||
virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
|
||||
int64_t valid_bits_offset, int64_t num_spaced_values,
|
||||
int64_t num_values, int64_t null_count) = 0;
|
||||
|
||||
/// \brief EXPERIMENTAL: Update statistics with an Arrow array without
|
||||
/// conversion to a primitive Parquet C type. Only implemented for certain
|
||||
/// Parquet type / Arrow type combinations like BYTE_ARRAY /
|
||||
/// arrow::BinaryArray
|
||||
///
|
||||
/// If update_counts is true then the null_count and num_values will be updated
|
||||
/// based on the null_count of values. Set to false if these are updated
|
||||
/// elsewhere (e.g. when updating a dictionary where the counts are taken from
|
||||
/// the indices and not the values)
|
||||
virtual void Update(const ::arrow::Array& values, bool update_counts = true) = 0;
|
||||
|
||||
/// \brief Set min and max values to particular values
|
||||
virtual void SetMinMax(const T& min, const T& max) = 0;
|
||||
|
||||
/// \brief Increments the null count directly
|
||||
/// Use Update to extract the null count from data. Use this if you determine
|
||||
/// the null count through some other means (e.g. dictionary arrays where the
|
||||
/// null count is determined from the indices)
|
||||
virtual void IncrementNullCount(int64_t n) = 0;
|
||||
|
||||
/// \brief Increments the number of values directly
|
||||
/// The same note on IncrementNullCount applies here
|
||||
virtual void IncrementNumValues(int64_t n) = 0;
|
||||
};
|
||||
|
||||
// Convenience aliases: one TypedStatistics instantiation per Parquet data type.
using BoolStatistics = TypedStatistics<BooleanType>;
|
||||
using Int32Statistics = TypedStatistics<Int32Type>;
|
||||
using Int64Statistics = TypedStatistics<Int64Type>;
|
||||
using FloatStatistics = TypedStatistics<FloatType>;
|
||||
using DoubleStatistics = TypedStatistics<DoubleType>;
|
||||
using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
|
||||
using FLBAStatistics = TypedStatistics<FLBAType>;
|
||||
|
||||
/// \brief Typed version of Statistics::Make
|
||||
template <typename DType>
|
||||
std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
|
||||
const ColumnDescriptor* descr,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
|
||||
return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
|
||||
}
|
||||
|
||||
/// \brief Create Statistics initialized to a particular state
|
||||
/// \param[in] min the minimum value
|
||||
/// \param[in] max the minimum value
|
||||
/// \param[in] num_values number of values
|
||||
/// \param[in] null_count number of null values
|
||||
/// \param[in] distinct_count number of distinct values
|
||||
template <typename DType>
|
||||
std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
|
||||
const typename DType::c_type& max,
|
||||
int64_t num_values,
|
||||
int64_t null_count,
|
||||
int64_t distinct_count) {
|
||||
return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
|
||||
DType::type_num, &min, &max, num_values, null_count, distinct_count));
|
||||
}
|
||||
|
||||
/// \brief Typed version of Statistics::Make
|
||||
template <typename DType>
|
||||
std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
|
||||
const ColumnDescriptor* descr, const std::string& encoded_min,
|
||||
const std::string& encoded_max, int64_t num_values, int64_t null_count,
|
||||
int64_t distinct_count, bool has_min_max, bool has_null_count,
|
||||
bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
|
||||
return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
|
||||
descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
|
||||
has_min_max, has_null_count, has_distinct_count,
|
||||
/*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool));
|
||||
}
|
||||
|
||||
/// \brief Typed version of Statistics::Make
|
||||
template <typename DType>
|
||||
std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
|
||||
const ColumnDescriptor* descr, const std::string& encoded_min,
|
||||
const std::string& encoded_max, int64_t num_values, int64_t null_count,
|
||||
int64_t distinct_count, bool has_min_max, bool has_null_count,
|
||||
bool has_distinct_count, std::optional<bool> is_min_value_exact,
|
||||
std::optional<bool> is_max_value_exact,
|
||||
::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
|
||||
return std::static_pointer_cast<TypedStatistics<DType>>(
|
||||
Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count,
|
||||
distinct_count, has_min_max, has_null_count, has_distinct_count,
|
||||
is_min_value_exact, is_max_value_exact, pool));
|
||||
}
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,303 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "parquet/column_reader.h"
|
||||
#include "parquet/file_reader.h"
|
||||
#include "parquet/stream_writer.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
/// \brief A class for reading Parquet files using an output stream type API.
|
||||
///
|
||||
/// The values given must be of the correct type i.e. the type must
|
||||
/// match the file schema exactly otherwise a ParquetException will be
|
||||
/// thrown.
|
||||
///
|
||||
/// The user must explicitly advance to the next row using the
|
||||
/// EndRow() function or EndRow input manipulator.
|
||||
///
|
||||
/// Required and optional fields are supported:
|
||||
/// - Required fields are read using operator>>(T)
|
||||
/// - Optional fields are read with
|
||||
/// operator>>(std::optional<T>)
|
||||
///
|
||||
/// Note that operator>>(std::optional<T>) can be used to read
|
||||
/// required fields.
|
||||
///
|
||||
/// Similarly operator>>(T) can be used to read optional fields.
|
||||
/// However, if the value is not present then a ParquetException will
|
||||
/// be raised.
|
||||
///
|
||||
/// Currently there is no support for repeated fields.
|
||||
///
|
||||
class PARQUET_EXPORT StreamReader {
|
||||
public:
|
||||
template <typename T>
|
||||
using optional = ::std::optional<T>;
|
||||
|
||||
// N.B. Default constructed objects are not usable. This
|
||||
// constructor is provided so that the object may be move
|
||||
// assigned afterwards.
|
||||
StreamReader() = default;
|
||||
|
||||
explicit StreamReader(std::unique_ptr<ParquetFileReader> reader);
|
||||
|
||||
~StreamReader() = default;
|
||||
|
||||
bool eof() const { return eof_; }
|
||||
|
||||
int current_column() const { return column_index_; }
|
||||
|
||||
int64_t current_row() const { return current_row_; }
|
||||
|
||||
int num_columns() const;
|
||||
|
||||
int64_t num_rows() const;
|
||||
|
||||
// Moving is possible.
|
||||
StreamReader(StreamReader&&) = default;
|
||||
StreamReader& operator=(StreamReader&&) = default;
|
||||
|
||||
// Copying is not allowed.
|
||||
StreamReader(const StreamReader&) = delete;
|
||||
StreamReader& operator=(const StreamReader&) = delete;
|
||||
|
||||
StreamReader& operator>>(bool& v);
|
||||
|
||||
StreamReader& operator>>(int8_t& v);
|
||||
|
||||
StreamReader& operator>>(uint8_t& v);
|
||||
|
||||
StreamReader& operator>>(int16_t& v);
|
||||
|
||||
StreamReader& operator>>(uint16_t& v);
|
||||
|
||||
StreamReader& operator>>(int32_t& v);
|
||||
|
||||
StreamReader& operator>>(uint32_t& v);
|
||||
|
||||
StreamReader& operator>>(int64_t& v);
|
||||
|
||||
StreamReader& operator>>(uint64_t& v);
|
||||
|
||||
StreamReader& operator>>(std::chrono::milliseconds& v);
|
||||
|
||||
StreamReader& operator>>(std::chrono::microseconds& v);
|
||||
|
||||
StreamReader& operator>>(float& v);
|
||||
|
||||
StreamReader& operator>>(double& v);
|
||||
|
||||
StreamReader& operator>>(char& v);
|
||||
|
||||
template <int N>
|
||||
StreamReader& operator>>(char (&v)[N]) {
|
||||
ReadFixedLength(v, N);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <std::size_t N>
|
||||
StreamReader& operator>>(std::array<char, N>& v) {
|
||||
ReadFixedLength(v.data(), static_cast<int>(N));
|
||||
return *this;
|
||||
}
|
||||
|
||||
// N.B. Cannot allow for reading to a arbitrary char pointer as the
|
||||
// length cannot be verified. Also it would overshadow the
|
||||
// char[N] input operator.
|
||||
// StreamReader& operator>>(char * v);
|
||||
|
||||
StreamReader& operator>>(std::string& v);
|
||||
|
||||
StreamReader& operator>>(::arrow::Decimal128& v);
|
||||
|
||||
// Input operators for optional fields.
|
||||
|
||||
StreamReader& operator>>(optional<bool>& v);
|
||||
|
||||
StreamReader& operator>>(optional<int8_t>& v);
|
||||
|
||||
StreamReader& operator>>(optional<uint8_t>& v);
|
||||
|
||||
StreamReader& operator>>(optional<int16_t>& v);
|
||||
|
||||
StreamReader& operator>>(optional<uint16_t>& v);
|
||||
|
||||
StreamReader& operator>>(optional<int32_t>& v);
|
||||
|
||||
StreamReader& operator>>(optional<uint32_t>& v);
|
||||
|
||||
StreamReader& operator>>(optional<int64_t>& v);
|
||||
|
||||
StreamReader& operator>>(optional<uint64_t>& v);
|
||||
|
||||
StreamReader& operator>>(optional<float>& v);
|
||||
|
||||
StreamReader& operator>>(optional<double>& v);
|
||||
|
||||
StreamReader& operator>>(optional<std::chrono::milliseconds>& v);
|
||||
|
||||
StreamReader& operator>>(optional<std::chrono::microseconds>& v);
|
||||
|
||||
StreamReader& operator>>(optional<char>& v);
|
||||
|
||||
StreamReader& operator>>(optional<std::string>& v);
|
||||
|
||||
StreamReader& operator>>(optional<::arrow::Decimal128>& v);
|
||||
|
||||
template <std::size_t N>
|
||||
StreamReader& operator>>(optional<std::array<char, N>>& v) {
|
||||
CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, N);
|
||||
FixedLenByteArray flba;
|
||||
if (ReadOptional(&flba)) {
|
||||
v = std::array<char, N>{};
|
||||
std::memcpy(v->data(), flba.ptr, N);
|
||||
} else {
|
||||
v.reset();
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Terminate current row and advance to next one.
|
||||
/// \throws ParquetException if all columns in the row were not
|
||||
/// read or skipped.
|
||||
void EndRow();
|
||||
|
||||
/// \brief Skip the data in the next columns.
|
||||
/// If the number of columns exceeds the columns remaining on the
|
||||
/// current row then skipping is terminated - it does _not_ continue
|
||||
/// skipping columns on the next row.
|
||||
/// Skipping of columns still requires the use 'EndRow' even if all
|
||||
/// remaining columns were skipped.
|
||||
/// \return Number of columns actually skipped.
|
||||
int64_t SkipColumns(int64_t num_columns_to_skip);
|
||||
|
||||
/// \brief Skip the data in the next rows.
|
||||
/// Skipping of rows is not allowed if reading of data for the
|
||||
/// current row is not finished.
|
||||
/// Skipping of rows will be terminated if the end of file is
|
||||
/// reached.
|
||||
/// \return Number of rows actually skipped.
|
||||
int64_t SkipRows(int64_t num_rows_to_skip);
|
||||
|
||||
protected:
|
||||
[[noreturn]] void ThrowReadFailedException(
|
||||
const std::shared_ptr<schema::PrimitiveNode>& node);
|
||||
|
||||
template <typename ReaderType, typename T>
|
||||
void Read(T* v) {
|
||||
const auto& node = nodes_[column_index_];
|
||||
auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
|
||||
int16_t def_level;
|
||||
int16_t rep_level;
|
||||
int64_t values_read;
|
||||
|
||||
reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
|
||||
|
||||
if (values_read != 1) {
|
||||
ThrowReadFailedException(node);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ReaderType, typename ReadType, typename T>
|
||||
void Read(T* v) {
|
||||
const auto& node = nodes_[column_index_];
|
||||
auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
|
||||
int16_t def_level;
|
||||
int16_t rep_level;
|
||||
ReadType tmp;
|
||||
int64_t values_read;
|
||||
|
||||
reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
|
||||
|
||||
if (values_read == 1) {
|
||||
*v = tmp;
|
||||
} else {
|
||||
ThrowReadFailedException(node);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename ReaderType, typename ReadType = typename ReaderType::T, typename T>
|
||||
void ReadOptional(optional<T>* v) {
|
||||
const auto& node = nodes_[column_index_];
|
||||
auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
|
||||
int16_t def_level;
|
||||
int16_t rep_level;
|
||||
ReadType tmp;
|
||||
int64_t values_read;
|
||||
|
||||
reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
|
||||
|
||||
if (values_read == 1) {
|
||||
*v = T(tmp);
|
||||
} else if ((values_read == 0) && (def_level == 0)) {
|
||||
v->reset();
|
||||
} else {
|
||||
ThrowReadFailedException(node);
|
||||
}
|
||||
}
|
||||
|
||||
void ReadFixedLength(char* ptr, int len);
|
||||
|
||||
void Read(ByteArray* v);
|
||||
|
||||
void Read(FixedLenByteArray* v);
|
||||
|
||||
bool ReadOptional(ByteArray* v);
|
||||
|
||||
bool ReadOptional(FixedLenByteArray* v);
|
||||
|
||||
void NextRowGroup();
|
||||
|
||||
void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
|
||||
int length = 0);
|
||||
|
||||
void SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip);
|
||||
|
||||
void SetEof();
|
||||
|
||||
private:
|
||||
std::unique_ptr<ParquetFileReader> file_reader_;
|
||||
std::shared_ptr<FileMetaData> file_metadata_;
|
||||
std::shared_ptr<RowGroupReader> row_group_reader_;
|
||||
std::vector<std::shared_ptr<ColumnReader>> column_readers_;
|
||||
std::vector<std::shared_ptr<schema::PrimitiveNode>> nodes_;
|
||||
|
||||
bool eof_{true};
|
||||
int row_group_index_{0};
|
||||
int column_index_{0};
|
||||
int64_t current_row_{0};
|
||||
int64_t row_group_row_offset_{0};
|
||||
|
||||
static constexpr int64_t kBatchSizeOne = 1;
|
||||
}; // class StreamReader
|
||||
|
||||
PARQUET_EXPORT
|
||||
StreamReader& operator>>(StreamReader&, EndRowType);
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,252 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/util/span.h"
|
||||
|
||||
#include "parquet/column_writer.h"
|
||||
#include "parquet/file_writer.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
/// \brief A class for writing Parquet files using an output stream type API.
|
||||
///
|
||||
/// The values given must be of the correct type i.e. the type must
|
||||
/// match the file schema exactly otherwise a ParquetException will be
|
||||
/// thrown.
|
||||
///
|
||||
/// The user must explicitly indicate the end of the row using the
|
||||
/// EndRow() function or EndRow output manipulator.
|
||||
///
|
||||
/// A maximum row group size can be configured, the default size is
|
||||
/// 512MB. Alternatively the row group size can be set to zero and the
|
||||
/// user can create new row groups by calling the EndRowGroup()
|
||||
/// function or using the EndRowGroup output manipulator.
|
||||
///
|
||||
/// Required and optional fields are supported:
|
||||
/// - Required fields are written using operator<<(T)
|
||||
/// - Optional fields are written using
|
||||
/// operator<<(std::optional<T>).
|
||||
///
|
||||
/// Note that operator<<(T) can be used to write optional fields.
|
||||
///
|
||||
/// Similarly, operator<<(std::optional<T>) can be used to
|
||||
/// write required fields. However if the optional parameter does not
|
||||
/// have a value (i.e. it is nullopt) then a ParquetException will be
|
||||
/// raised.
|
||||
///
|
||||
/// Currently there is no support for repeated fields.
|
||||
///
|
||||
class PARQUET_EXPORT StreamWriter {
|
||||
public:
|
||||
template <typename T>
|
||||
using optional = ::std::optional<T>;
|
||||
|
||||
// N.B. Default constructed objects are not usable. This
|
||||
// constructor is provided so that the object may be move
|
||||
// assigned afterwards.
|
||||
StreamWriter() = default;
|
||||
|
||||
explicit StreamWriter(std::unique_ptr<ParquetFileWriter> writer);
|
||||
|
||||
~StreamWriter() = default;
|
||||
|
||||
static void SetDefaultMaxRowGroupSize(int64_t max_size);
|
||||
|
||||
void SetMaxRowGroupSize(int64_t max_size);
|
||||
|
||||
int current_column() const { return column_index_; }
|
||||
|
||||
int64_t current_row() const { return current_row_; }
|
||||
|
||||
int num_columns() const;
|
||||
|
||||
// Moving is possible.
|
||||
StreamWriter(StreamWriter&&) = default;
|
||||
StreamWriter& operator=(StreamWriter&&) = default;
|
||||
|
||||
// Copying is not allowed.
|
||||
StreamWriter(const StreamWriter&) = delete;
|
||||
StreamWriter& operator=(const StreamWriter&) = delete;
|
||||
|
||||
/// \brief Output operators for required fields.
|
||||
/// These can also be used for optional fields when a value must be set.
|
||||
StreamWriter& operator<<(bool v);
|
||||
|
||||
StreamWriter& operator<<(int8_t v);
|
||||
|
||||
StreamWriter& operator<<(uint8_t v);
|
||||
|
||||
StreamWriter& operator<<(int16_t v);
|
||||
|
||||
StreamWriter& operator<<(uint16_t v);
|
||||
|
||||
StreamWriter& operator<<(int32_t v);
|
||||
|
||||
StreamWriter& operator<<(uint32_t v);
|
||||
|
||||
StreamWriter& operator<<(int64_t v);
|
||||
|
||||
StreamWriter& operator<<(uint64_t v);
|
||||
|
||||
StreamWriter& operator<<(const std::chrono::milliseconds& v);
|
||||
|
||||
StreamWriter& operator<<(const std::chrono::microseconds& v);
|
||||
|
||||
StreamWriter& operator<<(float v);
|
||||
|
||||
StreamWriter& operator<<(double v);
|
||||
|
||||
StreamWriter& operator<<(char v);
|
||||
|
||||
/// \brief Helper class to write fixed length strings.
|
||||
/// This is useful as the standard string view (such as
|
||||
/// std::string_view) is for variable length data.
|
||||
struct PARQUET_EXPORT FixedStringView {
|
||||
FixedStringView() = default;
|
||||
|
||||
explicit FixedStringView(const char* data_ptr);
|
||||
|
||||
FixedStringView(const char* data_ptr, std::size_t data_len);
|
||||
|
||||
const char* data{NULLPTR};
|
||||
std::size_t size{0};
|
||||
};
|
||||
|
||||
/// \brief Output operators for fixed length strings.
|
||||
template <int N>
|
||||
StreamWriter& operator<<(const char (&v)[N]) {
|
||||
return WriteFixedLength(v, N);
|
||||
}
|
||||
template <std::size_t N>
|
||||
StreamWriter& operator<<(const std::array<char, N>& v) {
|
||||
return WriteFixedLength(v.data(), N);
|
||||
}
|
||||
StreamWriter& operator<<(FixedStringView v);
|
||||
|
||||
/// \brief Output operators for variable length strings.
|
||||
StreamWriter& operator<<(const char* v);
|
||||
StreamWriter& operator<<(const std::string& v);
|
||||
StreamWriter& operator<<(::std::string_view v);
|
||||
|
||||
/// \brief Helper class to write variable length raw data.
|
||||
using RawDataView = ::arrow::util::span<const uint8_t>;
|
||||
|
||||
/// \brief Output operators for variable length raw data.
|
||||
StreamWriter& operator<<(RawDataView v);
|
||||
|
||||
/// \brief Output operator for optional fields.
|
||||
template <typename T>
|
||||
StreamWriter& operator<<(const optional<T>& v) {
|
||||
if (v) {
|
||||
return operator<<(*v);
|
||||
}
|
||||
SkipOptionalColumn();
|
||||
return *this;
|
||||
}
|
||||
|
||||
/// \brief Skip the next N columns of optional data. If there are
|
||||
/// less than N columns remaining then the excess columns are
|
||||
/// ignored.
|
||||
/// \throws ParquetException if there is an attempt to skip any
|
||||
/// required column.
|
||||
/// \return Number of columns actually skipped.
|
||||
int64_t SkipColumns(int num_columns_to_skip);
|
||||
|
||||
/// \brief Terminate the current row and advance to next one.
|
||||
/// \throws ParquetException if all columns in the row were not
|
||||
/// written or skipped.
|
||||
void EndRow();
|
||||
|
||||
/// \brief Terminate the current row group and create new one.
|
||||
void EndRowGroup();
|
||||
|
||||
protected:
|
||||
template <typename WriterType, typename T>
|
||||
StreamWriter& Write(const T v) {
|
||||
auto writer = static_cast<WriterType*>(row_group_writer_->column(column_index_++));
|
||||
|
||||
writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &v);
|
||||
|
||||
if (max_row_group_size_ > 0) {
|
||||
row_group_size_ += writer->estimated_buffered_value_bytes();
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
StreamWriter& WriteVariableLength(const char* data_ptr, std::size_t data_len,
|
||||
ConvertedType::type converted_type);
|
||||
|
||||
StreamWriter& WriteFixedLength(const char* data_ptr, std::size_t data_len);
|
||||
|
||||
void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
|
||||
int length = -1);
|
||||
|
||||
/// \brief Skip the next column which must be optional.
|
||||
/// \throws ParquetException if the next column does not exist or is
|
||||
/// not optional.
|
||||
void SkipOptionalColumn();
|
||||
|
||||
void WriteNullValue(ColumnWriter* writer);
|
||||
|
||||
private:
|
||||
using node_ptr_type = std::shared_ptr<schema::PrimitiveNode>;
|
||||
|
||||
struct null_deleter {
|
||||
void operator()(void*) {}
|
||||
};
|
||||
|
||||
int32_t column_index_{0};
|
||||
int64_t current_row_{0};
|
||||
int64_t row_group_size_{0};
|
||||
int64_t max_row_group_size_{default_row_group_size_};
|
||||
|
||||
std::unique_ptr<ParquetFileWriter> file_writer_;
|
||||
std::unique_ptr<RowGroupWriter, null_deleter> row_group_writer_;
|
||||
std::vector<node_ptr_type> nodes_;
|
||||
|
||||
static constexpr int16_t kDefLevelZero = 0;
|
||||
static constexpr int16_t kDefLevelOne = 1;
|
||||
static constexpr int16_t kRepLevelZero = 0;
|
||||
static constexpr int64_t kBatchSizeOne = 1;
|
||||
|
||||
static int64_t default_row_group_size_;
|
||||
};
|
||||
|
||||
struct PARQUET_EXPORT EndRowType {};
|
||||
constexpr EndRowType EndRow = {};
|
||||
|
||||
struct PARQUET_EXPORT EndRowGroupType {};
|
||||
constexpr EndRowGroupType EndRowGroup = {};
|
||||
|
||||
PARQUET_EXPORT
|
||||
StreamWriter& operator<<(StreamWriter&, EndRowType);
|
||||
|
||||
PARQUET_EXPORT
|
||||
StreamWriter& operator<<(StreamWriter&, EndRowGroupType);
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,891 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This module defines an abstract interface for iterating through pages in a
|
||||
// Parquet column chunk within a row group. It could be extended in the future
|
||||
// to iterate through all data pages in all chunks in a file.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "arrow/extension_type.h"
|
||||
#include "arrow/io/memory.h"
|
||||
#include "arrow/testing/util.h"
|
||||
#include "arrow/util/float16.h"
|
||||
|
||||
#include "parquet/column_page.h"
|
||||
#include "parquet/column_reader.h"
|
||||
#include "parquet/column_writer.h"
|
||||
#include "parquet/encoding.h"
|
||||
#include "parquet/platform.h"
|
||||
|
||||
// Expects that invoking `callable` throws an exception of type `ex_type`
// and additionally checks the caught exception against the matcher
// `property` before rethrowing it.
//
// https://github.com/google/googletest/pull/2904 might not be available
// in our version of gtest/gmock
#define EXPECT_THROW_THAT(callable, ex_type, property)   \
  EXPECT_THROW(                                           \
      try { (callable)(); } catch (const ex_type& err) {  \
        EXPECT_THAT(err, (property));                     \
        throw;                                            \
      },                                                  \
      ex_type)
|
||||
|
||||
namespace parquet {
|
||||
|
||||
static constexpr int FLBA_LENGTH = 12;
|
||||
|
||||
inline bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) {
|
||||
return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH);
|
||||
}
|
||||
|
||||
namespace test {
|
||||
|
||||
// Type list used to instantiate typed tests over the Parquet physical types.
using ParquetTypes =
    ::testing::Types<BooleanType, Int32Type, Int64Type, Int96Type, FloatType,
                     DoubleType, ByteArrayType, FLBAType>;
|
||||
|
||||
// Exception type for tests; it adds nothing to ParquetException beyond
// inheriting its constructors.
class ParquetTestException : public parquet::ParquetException {
  using ParquetException::ParquetException;
};
|
||||
|
||||
const char* get_data_dir();
|
||||
std::string get_bad_data_dir();
|
||||
|
||||
std::string get_data_file(const std::string& filename, bool is_good = true);
|
||||
|
||||
// Asserts (gtest) that both vectors have the same size and that all
// elements compare equal; a failing element comparison reports its index.
template <typename T>
static inline void assert_vector_equal(const std::vector<T>& left,
                                       const std::vector<T>& right) {
  ASSERT_EQ(left.size(), right.size());

  size_t i = 0;
  while (i < left.size()) {
    ASSERT_EQ(left[i], right[i]) << i;
    ++i;
  }
}
|
||||
|
||||
// Returns true when both vectors have the same size and no pair of
// elements at the same index compares unequal. The first mismatching
// index, together with both values, is reported on std::cerr.
template <typename T>
static inline bool vector_equal(const std::vector<T>& left, const std::vector<T>& right) {
  const size_t count = left.size();
  if (count != right.size()) {
    return false;
  }

  size_t pos = 0;
  while (pos < count) {
    if (left[pos] != right[pos]) {
      std::cerr << "index " << pos << " left was " << left[pos] << " right was "
                << right[pos] << std::endl;
      return false;
    }
    ++pos;
  }

  return true;
}
|
||||
|
||||
// Returns a copy of the elements of `values` in the half-open index range
// [start, end).
//
// Both bounds are clamped to [0, values.size()], so a range that reaches
// outside the vector yields only the part that exists instead of indexing
// out of bounds. An empty vector is returned when the clamped range is
// empty, which includes end < start.
template <typename T>
static std::vector<T> slice(const std::vector<T>& values, int start, int end) {
  const int size = static_cast<int>(values.size());
  const int first = std::max(0, std::min(start, size));
  const int last = std::max(0, std::min(end, size));
  if (last <= first) {
    return std::vector<T>();
  }

  return std::vector<T>(values.begin() + first, values.begin() + last);
}
|
||||
|
||||
void random_bytes(int n, uint32_t seed, std::vector<uint8_t>* out);
|
||||
void random_bools(int n, double p, uint32_t seed, bool* out);
|
||||
|
||||
// Fills out[0..n) with values drawn from a std::uniform_int_distribution
// over [min_value, max_value], using a std::default_random_engine seeded
// with `seed`. Nothing is written when n is not positive.
template <typename T>
inline void random_numbers(int n, uint32_t seed, T min_value, T max_value, T* out) {
  std::default_random_engine engine(seed);
  std::uniform_int_distribution<T> dist(min_value, max_value);
  int remaining = n;
  while (remaining-- > 0) {
    *out++ = dist(engine);
  }
}

// float specialization: draws from a std::uniform_real_distribution.
template <>
inline void random_numbers(int n, uint32_t seed, float min_value, float max_value,
                           float* out) {
  std::default_random_engine engine(seed);
  std::uniform_real_distribution<float> dist(min_value, max_value);
  int remaining = n;
  while (remaining-- > 0) {
    *out++ = dist(engine);
  }
}

// double specialization: draws from a std::uniform_real_distribution.
template <>
inline void random_numbers(int n, uint32_t seed, double min_value, double max_value,
                           double* out) {
  std::default_random_engine engine(seed);
  std::uniform_real_distribution<double> dist(min_value, max_value);
  int remaining = n;
  while (remaining-- > 0) {
    *out++ = dist(engine);
  }
}
|
||||
|
||||
void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value,
|
||||
Int96* out);
|
||||
|
||||
void random_float16_numbers(int n, uint32_t seed, ::arrow::util::Float16 min_value,
|
||||
::arrow::util::Float16 max_value, uint16_t* out);
|
||||
|
||||
void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out);
|
||||
|
||||
void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size,
|
||||
int max_size);
|
||||
|
||||
void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size);
|
||||
|
||||
void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out,
|
||||
int min_size, int max_size, double prefixed_probability);
|
||||
|
||||
void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out,
|
||||
double prefixed_probability);
|
||||
|
||||
template <typename Type, typename Sequence>
|
||||
std::shared_ptr<Buffer> EncodeValues(Encoding::type encoding, bool use_dictionary,
|
||||
const Sequence& values, int length,
|
||||
const ColumnDescriptor* descr) {
|
||||
auto encoder = MakeTypedEncoder<Type>(encoding, use_dictionary, descr);
|
||||
encoder->Put(values, length);
|
||||
return encoder->FlushValues();
|
||||
}
|
||||
|
||||
// Fills the first `num_values` entries of `values` with random numbers
// spanning std::numeric_limits<T>::min() to std::numeric_limits<T>::max(),
// generated from `seed`. `buffer` is not used by this generic version.
template <typename T>
static void InitValues(int num_values, uint32_t seed, std::vector<T>& values,
                       std::vector<uint8_t>& buffer) {
  using Limits = std::numeric_limits<T>;
  random_numbers(num_values, seed, Limits::min(), Limits::max(), values.data());
}
|
||||
|
||||
// Convenience overload: initializes the values with a fixed seed of 0.
template <typename T>
static void InitValues(int num_values, std::vector<T>& values,
                       std::vector<uint8_t>& buffer) {
  InitValues(num_values, /*seed=*/0, values, buffer);
}
|
||||
|
||||
template <typename T>
|
||||
static void InitDictValues(int num_values, int num_dicts, std::vector<T>& values,
|
||||
std::vector<uint8_t>& buffer) {
|
||||
int repeat_factor = num_values / num_dicts;
|
||||
InitValues<T>(num_dicts, values, buffer);
|
||||
// add some repeated values
|
||||
for (int j = 1; j < repeat_factor; ++j) {
|
||||
for (int i = 0; i < num_dicts; ++i) {
|
||||
std::memcpy(&values[num_dicts * j + i], &values[i], sizeof(T));
|
||||
}
|
||||
}
|
||||
// computed only dict_per_page * repeat_factor - 1 values < num_values
|
||||
// compute remaining
|
||||
for (int i = num_dicts * repeat_factor; i < num_values; ++i) {
|
||||
std::memcpy(&values[i], &values[i - num_dicts * repeat_factor], sizeof(T));
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void InitDictValues<bool>(int num_values, int num_dicts, std::vector<bool>& values,
|
||||
std::vector<uint8_t>& buffer) {
|
||||
// No op for bool
|
||||
}
|
||||
|
||||
class MockPageReader : public PageReader {
|
||||
public:
|
||||
explicit MockPageReader(const std::vector<std::shared_ptr<Page>>& pages)
|
||||
: pages_(pages), page_index_(0) {}
|
||||
|
||||
std::shared_ptr<Page> NextPage() override {
|
||||
if (page_index_ == static_cast<int>(pages_.size())) {
|
||||
// EOS to consumer
|
||||
return std::shared_ptr<Page>(nullptr);
|
||||
}
|
||||
return pages_[page_index_++];
|
||||
}
|
||||
|
||||
// No-op
|
||||
void set_max_page_header_size(uint32_t size) override {}
|
||||
|
||||
private:
|
||||
std::vector<std::shared_ptr<Page>> pages_;
|
||||
int page_index_;
|
||||
};
|
||||
|
||||
// TODO(wesm): this is only used for testing for now. Refactor to form part of
|
||||
// primary file write path
|
||||
template <typename Type>
|
||||
class DataPageBuilder {
|
||||
public:
|
||||
using c_type = typename Type::c_type;
|
||||
|
||||
// This class writes data and metadata to the passed inputs
|
||||
explicit DataPageBuilder(ArrowOutputStream* sink)
|
||||
: sink_(sink),
|
||||
num_values_(0),
|
||||
encoding_(Encoding::PLAIN),
|
||||
definition_level_encoding_(Encoding::RLE),
|
||||
repetition_level_encoding_(Encoding::RLE),
|
||||
have_def_levels_(false),
|
||||
have_rep_levels_(false),
|
||||
have_values_(false) {}
|
||||
|
||||
void AppendDefLevels(const std::vector<int16_t>& levels, int16_t max_level,
|
||||
Encoding::type encoding = Encoding::RLE) {
|
||||
AppendLevels(levels, max_level, encoding);
|
||||
|
||||
num_values_ = std::max(static_cast<int32_t>(levels.size()), num_values_);
|
||||
definition_level_encoding_ = encoding;
|
||||
have_def_levels_ = true;
|
||||
}
|
||||
|
||||
void AppendRepLevels(const std::vector<int16_t>& levels, int16_t max_level,
|
||||
Encoding::type encoding = Encoding::RLE) {
|
||||
AppendLevels(levels, max_level, encoding);
|
||||
|
||||
num_values_ = std::max(static_cast<int32_t>(levels.size()), num_values_);
|
||||
repetition_level_encoding_ = encoding;
|
||||
have_rep_levels_ = true;
|
||||
}
|
||||
|
||||
void AppendValues(const ColumnDescriptor* d, const std::vector<c_type>& values,
|
||||
Encoding::type encoding = Encoding::PLAIN) {
|
||||
std::shared_ptr<Buffer> values_sink = EncodeValues<Type>(
|
||||
encoding, false, values.data(), static_cast<int>(values.size()), d);
|
||||
PARQUET_THROW_NOT_OK(sink_->Write(values_sink->data(), values_sink->size()));
|
||||
|
||||
num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
|
||||
encoding_ = encoding;
|
||||
have_values_ = true;
|
||||
}
|
||||
|
||||
int32_t num_values() const { return num_values_; }
|
||||
|
||||
Encoding::type encoding() const { return encoding_; }
|
||||
|
||||
Encoding::type rep_level_encoding() const { return repetition_level_encoding_; }
|
||||
|
||||
Encoding::type def_level_encoding() const { return definition_level_encoding_; }
|
||||
|
||||
private:
|
||||
ArrowOutputStream* sink_;
|
||||
|
||||
int32_t num_values_;
|
||||
Encoding::type encoding_;
|
||||
Encoding::type definition_level_encoding_;
|
||||
Encoding::type repetition_level_encoding_;
|
||||
|
||||
bool have_def_levels_;
|
||||
bool have_rep_levels_;
|
||||
bool have_values_;
|
||||
|
||||
// Used internally for both repetition and definition levels
|
||||
void AppendLevels(const std::vector<int16_t>& levels, int16_t max_level,
|
||||
Encoding::type encoding) {
|
||||
if (encoding != Encoding::RLE) {
|
||||
ParquetException::NYI("only rle encoding currently implemented");
|
||||
}
|
||||
|
||||
std::vector<uint8_t> encode_buffer(LevelEncoder::MaxBufferSize(
|
||||
Encoding::RLE, max_level, static_cast<int>(levels.size())));
|
||||
|
||||
// We encode into separate memory from the output stream because the
|
||||
// RLE-encoded bytes have to be preceded in the stream by their absolute
|
||||
// size.
|
||||
LevelEncoder encoder;
|
||||
encoder.Init(encoding, max_level, static_cast<int>(levels.size()),
|
||||
encode_buffer.data(), static_cast<int>(encode_buffer.size()));
|
||||
|
||||
encoder.Encode(static_cast<int>(levels.size()), levels.data());
|
||||
|
||||
int32_t rle_bytes = encoder.len();
|
||||
PARQUET_THROW_NOT_OK(
|
||||
sink_->Write(reinterpret_cast<const uint8_t*>(&rle_bytes), sizeof(int32_t)));
|
||||
PARQUET_THROW_NOT_OK(sink_->Write(encode_buffer.data(), rle_bytes));
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
inline void DataPageBuilder<BooleanType>::AppendValues(const ColumnDescriptor* d,
|
||||
const std::vector<bool>& values,
|
||||
Encoding::type encoding) {
|
||||
if (encoding != Encoding::PLAIN) {
|
||||
ParquetException::NYI("only plain encoding currently implemented");
|
||||
}
|
||||
|
||||
auto encoder = MakeTypedEncoder<BooleanType>(Encoding::PLAIN, false, d);
|
||||
dynamic_cast<BooleanEncoder*>(encoder.get())
|
||||
->Put(values, static_cast<int>(values.size()));
|
||||
std::shared_ptr<Buffer> buffer = encoder->FlushValues();
|
||||
PARQUET_THROW_NOT_OK(sink_->Write(buffer->data(), buffer->size()));
|
||||
|
||||
num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
|
||||
encoding_ = encoding;
|
||||
have_values_ = true;
|
||||
}
|
||||
|
||||
template <typename Type>
|
||||
static std::shared_ptr<DataPageV1> MakeDataPage(
|
||||
const ColumnDescriptor* d, const std::vector<typename Type::c_type>& values,
|
||||
int num_vals, Encoding::type encoding, const uint8_t* indices, int indices_size,
|
||||
const std::vector<int16_t>& def_levels, int16_t max_def_level,
|
||||
const std::vector<int16_t>& rep_levels, int16_t max_rep_level) {
|
||||
int num_values = 0;
|
||||
|
||||
auto page_stream = CreateOutputStream();
|
||||
test::DataPageBuilder<Type> page_builder(page_stream.get());
|
||||
|
||||
if (!rep_levels.empty()) {
|
||||
page_builder.AppendRepLevels(rep_levels, max_rep_level);
|
||||
}
|
||||
if (!def_levels.empty()) {
|
||||
page_builder.AppendDefLevels(def_levels, max_def_level);
|
||||
}
|
||||
|
||||
if (encoding == Encoding::PLAIN) {
|
||||
page_builder.AppendValues(d, values, encoding);
|
||||
num_values = std::max(page_builder.num_values(), num_vals);
|
||||
} else { // DICTIONARY PAGES
|
||||
PARQUET_THROW_NOT_OK(page_stream->Write(indices, indices_size));
|
||||
num_values = std::max(page_builder.num_values(), num_vals);
|
||||
}
|
||||
|
||||
PARQUET_ASSIGN_OR_THROW(auto buffer, page_stream->Finish());
|
||||
|
||||
return std::make_shared<DataPageV1>(buffer, num_values, encoding,
|
||||
page_builder.def_level_encoding(),
|
||||
page_builder.rep_level_encoding(), buffer->size());
|
||||
}
|
||||
|
||||
template <typename TYPE>
|
||||
class DictionaryPageBuilder {
|
||||
public:
|
||||
typedef typename TYPE::c_type TC;
|
||||
static constexpr int TN = TYPE::type_num;
|
||||
using SpecializedEncoder = typename EncodingTraits<TYPE>::Encoder;
|
||||
|
||||
// This class writes data and metadata to the passed inputs
|
||||
explicit DictionaryPageBuilder(const ColumnDescriptor* d)
|
||||
: num_dict_values_(0), have_values_(false) {
|
||||
auto encoder = MakeTypedEncoder<TYPE>(Encoding::PLAIN, true, d);
|
||||
dict_traits_ = dynamic_cast<DictEncoder<TYPE>*>(encoder.get());
|
||||
encoder_.reset(dynamic_cast<SpecializedEncoder*>(encoder.release()));
|
||||
}
|
||||
|
||||
~DictionaryPageBuilder() {}
|
||||
|
||||
std::shared_ptr<Buffer> AppendValues(const std::vector<TC>& values) {
|
||||
int num_values = static_cast<int>(values.size());
|
||||
// Dictionary encoding
|
||||
encoder_->Put(values.data(), num_values);
|
||||
num_dict_values_ = dict_traits_->num_entries();
|
||||
have_values_ = true;
|
||||
return encoder_->FlushValues();
|
||||
}
|
||||
|
||||
std::shared_ptr<Buffer> WriteDict() {
|
||||
std::shared_ptr<Buffer> dict_buffer =
|
||||
AllocateBuffer(::arrow::default_memory_pool(), dict_traits_->dict_encoded_size());
|
||||
dict_traits_->WriteDict(dict_buffer->mutable_data());
|
||||
return dict_buffer;
|
||||
}
|
||||
|
||||
int32_t num_values() const { return num_dict_values_; }
|
||||
|
||||
private:
|
||||
DictEncoder<TYPE>* dict_traits_;
|
||||
std::unique_ptr<SpecializedEncoder> encoder_;
|
||||
int32_t num_dict_values_;
|
||||
bool have_values_;
|
||||
};
|
||||
|
||||
template <>
|
||||
inline DictionaryPageBuilder<BooleanType>::DictionaryPageBuilder(
|
||||
const ColumnDescriptor* d) {
|
||||
ParquetException::NYI("only plain encoding currently implemented for boolean");
|
||||
}
|
||||
|
||||
template <>
|
||||
inline std::shared_ptr<Buffer> DictionaryPageBuilder<BooleanType>::WriteDict() {
|
||||
ParquetException::NYI("only plain encoding currently implemented for boolean");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline std::shared_ptr<Buffer> DictionaryPageBuilder<BooleanType>::AppendValues(
|
||||
const std::vector<TC>& values) {
|
||||
ParquetException::NYI("only plain encoding currently implemented for boolean");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <typename Type>
|
||||
inline static std::shared_ptr<DictionaryPage> MakeDictPage(
|
||||
const ColumnDescriptor* d, const std::vector<typename Type::c_type>& values,
|
||||
const std::vector<int>& values_per_page, Encoding::type encoding,
|
||||
std::vector<std::shared_ptr<Buffer>>& rle_indices) {
|
||||
test::DictionaryPageBuilder<Type> page_builder(d);
|
||||
int num_pages = static_cast<int>(values_per_page.size());
|
||||
int value_start = 0;
|
||||
|
||||
for (int i = 0; i < num_pages; i++) {
|
||||
rle_indices.push_back(page_builder.AppendValues(
|
||||
slice(values, value_start, value_start + values_per_page[i])));
|
||||
value_start += values_per_page[i];
|
||||
}
|
||||
|
||||
auto buffer = page_builder.WriteDict();
|
||||
|
||||
return std::make_shared<DictionaryPage>(buffer, page_builder.num_values(),
|
||||
Encoding::PLAIN);
|
||||
}
|
||||
|
||||
// Given def/rep levels and values create multiple dict pages
|
||||
template <typename Type>
|
||||
inline static void PaginateDict(const ColumnDescriptor* d,
|
||||
const std::vector<typename Type::c_type>& values,
|
||||
const std::vector<int16_t>& def_levels,
|
||||
int16_t max_def_level,
|
||||
const std::vector<int16_t>& rep_levels,
|
||||
int16_t max_rep_level, int num_levels_per_page,
|
||||
const std::vector<int>& values_per_page,
|
||||
std::vector<std::shared_ptr<Page>>& pages,
|
||||
Encoding::type encoding = Encoding::RLE_DICTIONARY) {
|
||||
int num_pages = static_cast<int>(values_per_page.size());
|
||||
std::vector<std::shared_ptr<Buffer>> rle_indices;
|
||||
std::shared_ptr<DictionaryPage> dict_page =
|
||||
MakeDictPage<Type>(d, values, values_per_page, encoding, rle_indices);
|
||||
pages.push_back(dict_page);
|
||||
int def_level_start = 0;
|
||||
int def_level_end = 0;
|
||||
int rep_level_start = 0;
|
||||
int rep_level_end = 0;
|
||||
for (int i = 0; i < num_pages; i++) {
|
||||
if (max_def_level > 0) {
|
||||
def_level_start = i * num_levels_per_page;
|
||||
def_level_end = (i + 1) * num_levels_per_page;
|
||||
}
|
||||
if (max_rep_level > 0) {
|
||||
rep_level_start = i * num_levels_per_page;
|
||||
rep_level_end = (i + 1) * num_levels_per_page;
|
||||
}
|
||||
std::shared_ptr<DataPageV1> data_page = MakeDataPage<Int32Type>(
|
||||
d, {}, values_per_page[i], encoding, rle_indices[i]->data(),
|
||||
static_cast<int>(rle_indices[i]->size()),
|
||||
slice(def_levels, def_level_start, def_level_end), max_def_level,
|
||||
slice(rep_levels, rep_level_start, rep_level_end), max_rep_level);
|
||||
pages.push_back(data_page);
|
||||
}
|
||||
}
|
||||
|
||||
// Given def/rep levels and values create multiple plain pages
|
||||
template <typename Type>
|
||||
static inline void PaginatePlain(const ColumnDescriptor* d,
|
||||
const std::vector<typename Type::c_type>& values,
|
||||
const std::vector<int16_t>& def_levels,
|
||||
int16_t max_def_level,
|
||||
const std::vector<int16_t>& rep_levels,
|
||||
int16_t max_rep_level, int num_levels_per_page,
|
||||
const std::vector<int>& values_per_page,
|
||||
std::vector<std::shared_ptr<Page>>& pages,
|
||||
Encoding::type encoding = Encoding::PLAIN) {
|
||||
int num_pages = static_cast<int>(values_per_page.size());
|
||||
int def_level_start = 0;
|
||||
int def_level_end = 0;
|
||||
int rep_level_start = 0;
|
||||
int rep_level_end = 0;
|
||||
int value_start = 0;
|
||||
for (int i = 0; i < num_pages; i++) {
|
||||
if (max_def_level > 0) {
|
||||
def_level_start = i * num_levels_per_page;
|
||||
def_level_end = (i + 1) * num_levels_per_page;
|
||||
}
|
||||
if (max_rep_level > 0) {
|
||||
rep_level_start = i * num_levels_per_page;
|
||||
rep_level_end = (i + 1) * num_levels_per_page;
|
||||
}
|
||||
std::shared_ptr<DataPage> page = MakeDataPage<Type>(
|
||||
d, slice(values, value_start, value_start + values_per_page[i]),
|
||||
values_per_page[i], encoding, nullptr, 0,
|
||||
slice(def_levels, def_level_start, def_level_end), max_def_level,
|
||||
slice(rep_levels, rep_level_start, rep_level_end), max_rep_level);
|
||||
pages.push_back(page);
|
||||
value_start += values_per_page[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Generates pages from randomly generated data
|
||||
template <typename Type>
|
||||
static inline int MakePages(const ColumnDescriptor* d, int num_pages, int levels_per_page,
|
||||
std::vector<int16_t>& def_levels,
|
||||
std::vector<int16_t>& rep_levels,
|
||||
std::vector<typename Type::c_type>& values,
|
||||
std::vector<uint8_t>& buffer,
|
||||
std::vector<std::shared_ptr<Page>>& pages,
|
||||
Encoding::type encoding = Encoding::PLAIN,
|
||||
uint32_t seed = 0) {
|
||||
int num_levels = levels_per_page * num_pages;
|
||||
int num_values = 0;
|
||||
int16_t zero = 0;
|
||||
int16_t max_def_level = d->max_definition_level();
|
||||
int16_t max_rep_level = d->max_repetition_level();
|
||||
std::vector<int> values_per_page(num_pages, levels_per_page);
|
||||
// Create definition levels
|
||||
if (max_def_level > 0 && num_levels != 0) {
|
||||
def_levels.resize(num_levels);
|
||||
random_numbers(num_levels, seed, zero, max_def_level, def_levels.data());
|
||||
for (int p = 0; p < num_pages; p++) {
|
||||
int num_values_per_page = 0;
|
||||
for (int i = 0; i < levels_per_page; i++) {
|
||||
if (def_levels[i + p * levels_per_page] == max_def_level) {
|
||||
num_values_per_page++;
|
||||
num_values++;
|
||||
}
|
||||
}
|
||||
values_per_page[p] = num_values_per_page;
|
||||
}
|
||||
} else {
|
||||
num_values = num_levels;
|
||||
}
|
||||
// Create repetition levels
|
||||
if (max_rep_level > 0 && num_levels != 0) {
|
||||
rep_levels.resize(num_levels);
|
||||
// Using a different seed so that def_levels and rep_levels are different.
|
||||
random_numbers(num_levels, seed + 789, zero, max_rep_level, rep_levels.data());
|
||||
// The generated levels are random. Force the very first page to start with a new
|
||||
// record.
|
||||
rep_levels[0] = 0;
|
||||
// For a null value, rep_levels and def_levels are both 0.
|
||||
// If we have a repeated value right after this, it needs to start with
|
||||
// rep_level = 0 to indicate a new record.
|
||||
for (int i = 0; i < num_levels - 1; ++i) {
|
||||
if (rep_levels[i] == 0 && def_levels[i] == 0) {
|
||||
rep_levels[i + 1] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Create values
|
||||
values.resize(num_values);
|
||||
if (encoding == Encoding::PLAIN) {
|
||||
InitValues<typename Type::c_type>(num_values, values, buffer);
|
||||
PaginatePlain<Type>(d, values, def_levels, max_def_level, rep_levels, max_rep_level,
|
||||
levels_per_page, values_per_page, pages);
|
||||
} else if (encoding == Encoding::RLE_DICTIONARY ||
|
||||
encoding == Encoding::PLAIN_DICTIONARY) {
|
||||
// Calls InitValues and repeats the data
|
||||
InitDictValues<typename Type::c_type>(num_values, levels_per_page, values, buffer);
|
||||
PaginateDict<Type>(d, values, def_levels, max_def_level, rep_levels, max_rep_level,
|
||||
levels_per_page, values_per_page, pages);
|
||||
}
|
||||
|
||||
return num_values;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Test data generation
|
||||
|
||||
template <>
|
||||
void inline InitValues<bool>(int num_values, uint32_t seed, std::vector<bool>& values,
|
||||
std::vector<uint8_t>& buffer) {
|
||||
values = {};
|
||||
if (seed == 0) {
|
||||
seed = static_cast<uint32_t>(::arrow::random_seed());
|
||||
}
|
||||
::arrow::random_is_valid(num_values, 0.5, &values, static_cast<int>(seed));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void InitValues<ByteArray>(int num_values, uint32_t seed,
|
||||
std::vector<ByteArray>& values,
|
||||
std::vector<uint8_t>& buffer) {
|
||||
int max_byte_array_len = 12;
|
||||
int num_bytes = static_cast<int>(max_byte_array_len + sizeof(uint32_t));
|
||||
size_t nbytes = num_values * num_bytes;
|
||||
buffer.resize(nbytes);
|
||||
random_byte_array(num_values, seed, buffer.data(), values.data(), max_byte_array_len);
|
||||
}
|
||||
|
||||
// Like InitValues<ByteArray>, but with caller-chosen length bounds and a fixed
// seed of 0.
//
// `buffer` is sized for `num_values` entries of `max_len` payload bytes plus a
// uint32_t each. `values` is written through its data pointer and is not resized
// here, so the caller must provide room for `num_values` elements.
inline void InitWideByteArrayValues(int num_values, std::vector<ByteArray>& values,
                                    std::vector<uint8_t>& buffer, int min_len,
                                    int max_len) {
  const int bytes_per_value = static_cast<int>(max_len + sizeof(uint32_t));
  // Widen before multiplying so a large value count cannot overflow `int`.
  const size_t nbytes = static_cast<size_t>(num_values) * bytes_per_value;
  buffer.resize(nbytes);
  random_byte_array(num_values, 0, buffer.data(), values.data(), min_len, max_len);
}
|
||||
|
||||
template <>
|
||||
inline void InitValues<FLBA>(int num_values, uint32_t seed, std::vector<FLBA>& values,
|
||||
std::vector<uint8_t>& buffer) {
|
||||
size_t nbytes = num_values * FLBA_LENGTH;
|
||||
buffer.resize(nbytes);
|
||||
random_fixed_byte_array(num_values, seed, buffer.data(), FLBA_LENGTH, values.data());
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void InitValues<Int96>(int num_values, uint32_t seed, std::vector<Int96>& values,
|
||||
std::vector<uint8_t>& buffer) {
|
||||
random_Int96_numbers(num_values, seed, std::numeric_limits<int32_t>::min(),
|
||||
std::numeric_limits<int32_t>::max(), values.data());
|
||||
}
|
||||
|
||||
// Returns the canonical test column name for column index `i`, i.e. "column_<i>".
//
// Uses std::to_string rather than a string stream: no stream object is built and
// the digits are not subject to the stream's imbued locale (e.g. digit grouping).
inline std::string TestColumnName(int i) { return "column_" + std::to_string(i); }
|
||||
|
||||
// This class lives here because of its dependency on the InitValues specializations.
|
||||
template <typename TestType>
|
||||
class PrimitiveTypedTest : public ::testing::Test {
|
||||
public:
|
||||
using c_type = typename TestType::c_type;
|
||||
|
||||
virtual void SetUpSchema(Repetition::type repetition, int num_columns) {
|
||||
std::vector<schema::NodePtr> fields;
|
||||
|
||||
for (int i = 0; i < num_columns; ++i) {
|
||||
std::string name = TestColumnName(i);
|
||||
fields.push_back(schema::PrimitiveNode::Make(name, repetition, TestType::type_num,
|
||||
ConvertedType::NONE, FLBA_LENGTH));
|
||||
}
|
||||
node_ = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields);
|
||||
schema_.Init(node_);
|
||||
}
|
||||
|
||||
void SetUpSchema(Repetition::type repetition) { this->SetUpSchema(repetition, 1); }
|
||||
|
||||
void GenerateData(int64_t num_values, uint32_t seed = 0);
|
||||
void SetupValuesOut(int64_t num_values);
|
||||
void SyncValuesOut();
|
||||
|
||||
protected:
|
||||
schema::NodePtr node_;
|
||||
SchemaDescriptor schema_;
|
||||
|
||||
// Input buffers
|
||||
std::vector<c_type> values_;
|
||||
|
||||
std::vector<int16_t> def_levels_;
|
||||
|
||||
std::vector<uint8_t> buffer_;
|
||||
// Pointer to the values, needed as we cannot use std::vector<bool>::data()
|
||||
c_type* values_ptr_;
|
||||
std::vector<uint8_t> bool_buffer_;
|
||||
|
||||
// Output buffers
|
||||
std::vector<c_type> values_out_;
|
||||
std::vector<uint8_t> bool_buffer_out_;
|
||||
c_type* values_out_ptr_;
|
||||
};
|
||||
|
||||
template <typename TestType>
|
||||
inline void PrimitiveTypedTest<TestType>::SyncValuesOut() {}
|
||||
|
||||
template <>
|
||||
inline void PrimitiveTypedTest<BooleanType>::SyncValuesOut() {
|
||||
std::vector<uint8_t>::const_iterator source_iterator = bool_buffer_out_.begin();
|
||||
std::vector<c_type>::iterator destination_iterator = values_out_.begin();
|
||||
while (source_iterator != bool_buffer_out_.end()) {
|
||||
*destination_iterator++ = *source_iterator++ != 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename TestType>
|
||||
inline void PrimitiveTypedTest<TestType>::SetupValuesOut(int64_t num_values) {
|
||||
values_out_.clear();
|
||||
values_out_.resize(num_values);
|
||||
values_out_ptr_ = values_out_.data();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void PrimitiveTypedTest<BooleanType>::SetupValuesOut(int64_t num_values) {
|
||||
values_out_.clear();
|
||||
values_out_.resize(num_values);
|
||||
|
||||
bool_buffer_out_.clear();
|
||||
bool_buffer_out_.resize(num_values);
|
||||
// Write once to all values so we can copy it without getting Valgrind errors
|
||||
// about uninitialised values.
|
||||
std::fill(bool_buffer_out_.begin(), bool_buffer_out_.end(), true);
|
||||
values_out_ptr_ = reinterpret_cast<bool*>(bool_buffer_out_.data());
|
||||
}
|
||||
|
||||
template <typename TestType>
|
||||
inline void PrimitiveTypedTest<TestType>::GenerateData(int64_t num_values,
|
||||
uint32_t seed) {
|
||||
def_levels_.resize(num_values);
|
||||
values_.resize(num_values);
|
||||
|
||||
InitValues<c_type>(static_cast<int>(num_values), seed, values_, buffer_);
|
||||
values_ptr_ = values_.data();
|
||||
|
||||
std::fill(def_levels_.begin(), def_levels_.end(), 1);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void PrimitiveTypedTest<BooleanType>::GenerateData(int64_t num_values,
|
||||
uint32_t seed) {
|
||||
def_levels_.resize(num_values);
|
||||
values_.resize(num_values);
|
||||
|
||||
InitValues<c_type>(static_cast<int>(num_values), seed, values_, buffer_);
|
||||
bool_buffer_.resize(num_values);
|
||||
std::copy(values_.begin(), values_.end(), bool_buffer_.begin());
|
||||
values_ptr_ = reinterpret_cast<bool*>(bool_buffer_.data());
|
||||
|
||||
std::fill(def_levels_.begin(), def_levels_.end(), 1);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// test data generation
|
||||
|
||||
// Fills `out[0 .. num_values)` via random_numbers, bounded by
// std::numeric_limits<T>::min() and ::max(). `heap` is unused in the generic case.
//
// Note: for floating-point T, numeric_limits<T>::min() is the smallest positive
// normal value, not the most negative one.
template <typename T>
inline void GenerateData(int num_values, T* out, std::vector<uint8_t>* heap) {
  const T lower_bound = std::numeric_limits<T>::min();
  const T upper_bound = std::numeric_limits<T>::max();
  // seed the prng so failure is deterministic
  random_numbers(num_values, 0, lower_bound, upper_bound, out);
}
|
||||
|
||||
// Same as the generic GenerateData, but with caller-supplied bounds `min` and
// `max` forwarded to random_numbers. `heap` is unused.
template <typename T>
inline void GenerateBoundData(int num_values, T* out, T min, T max,
                              std::vector<uint8_t>* heap) {
  // A fixed seed of 0 keeps failures reproducible.
  random_numbers(num_values, /*seed=*/0, min, max, out);
}
|
||||
|
||||
template <>
|
||||
inline void GenerateData<bool>(int num_values, bool* out, std::vector<uint8_t>* heap) {
|
||||
// seed the prng so failure is deterministic
|
||||
random_bools(num_values, 0.5, 0, out);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void GenerateData<Int96>(int num_values, Int96* out, std::vector<uint8_t>* heap) {
|
||||
// seed the prng so failure is deterministic
|
||||
random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(),
|
||||
std::numeric_limits<int32_t>::max(), out);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void GenerateData<ByteArray>(int num_values, ByteArray* out,
|
||||
std::vector<uint8_t>* heap) {
|
||||
int max_byte_array_len = 12;
|
||||
heap->resize(num_values * max_byte_array_len);
|
||||
// seed the prng so failure is deterministic
|
||||
random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len);
|
||||
}
|
||||
|
||||
// Generate ByteArray or FLBA data where there is a given probability
|
||||
// for each value to share a common prefix with its predecessor.
|
||||
// This is useful to exercise prefix-based encodings such as DELTA_BYTE_ARRAY.
|
||||
template <typename T>
|
||||
inline void GeneratePrefixedData(int num_values, T* out, std::vector<uint8_t>* heap,
|
||||
double prefixed_probability);
|
||||
|
||||
template <>
|
||||
inline void GeneratePrefixedData(int num_values, ByteArray* out,
|
||||
std::vector<uint8_t>* heap,
|
||||
double prefixed_probability) {
|
||||
int max_byte_array_len = 12;
|
||||
heap->resize(num_values * max_byte_array_len);
|
||||
// seed the prng so failure is deterministic
|
||||
prefixed_random_byte_array(num_values, /*seed=*/0, heap->data(), out, /*min_size=*/2,
|
||||
/*max_size=*/max_byte_array_len, prefixed_probability);
|
||||
}
|
||||
|
||||
static constexpr int kGenerateDataFLBALength = 8;
|
||||
|
||||
template <>
|
||||
inline void GeneratePrefixedData<FLBA>(int num_values, FLBA* out,
|
||||
std::vector<uint8_t>* heap,
|
||||
double prefixed_probability) {
|
||||
heap->resize(num_values * kGenerateDataFLBALength);
|
||||
// seed the prng so failure is deterministic
|
||||
prefixed_random_byte_array(num_values, /*seed=*/0, heap->data(),
|
||||
kGenerateDataFLBALength, out, prefixed_probability);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void GenerateData<FLBA>(int num_values, FLBA* out, std::vector<uint8_t>* heap) {
|
||||
heap->resize(num_values * kGenerateDataFLBALength);
|
||||
// seed the prng so failure is deterministic
|
||||
random_fixed_byte_array(num_values, 0, heap->data(), kGenerateDataFLBALength, out);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Test utility functions for geometry
|
||||
|
||||
#if defined(ARROW_LITTLE_ENDIAN)
|
||||
static constexpr uint8_t kWkbNativeEndianness = 0x01;
|
||||
#else
|
||||
static constexpr uint8_t kWkbNativeEndianness = 0x00;
|
||||
#endif
|
||||
|
||||
/// \brief Number of bytes in a WKB Point with X and Y dimensions (uint8_t endian,
|
||||
/// uint32_t geometry type, 2 * double coordinates)
|
||||
static constexpr int kWkbPointXYSize = 21;
|
||||
|
||||
std::string MakeWKBPoint(const std::vector<double>& xyzm, bool has_z, bool has_m);
|
||||
|
||||
std::optional<std::pair<double, double>> GetWKBPointCoordinateXY(const ByteArray& value);
|
||||
|
||||
// A minimal version of a geoarrow.wkb extension type to test interoperability
|
||||
class GeoArrowWkbExtensionType : public ::arrow::ExtensionType {
|
||||
public:
|
||||
explicit GeoArrowWkbExtensionType(std::shared_ptr<::arrow::DataType> storage_type,
|
||||
std::string metadata)
|
||||
: ::arrow::ExtensionType(std::move(storage_type)), metadata_(std::move(metadata)) {}
|
||||
|
||||
std::string extension_name() const override { return "geoarrow.wkb"; }
|
||||
|
||||
std::string Serialize() const override { return metadata_; }
|
||||
|
||||
::arrow::Result<std::shared_ptr<::arrow::DataType>> Deserialize(
|
||||
std::shared_ptr<::arrow::DataType> storage_type,
|
||||
const std::string& serialized_data) const override {
|
||||
return std::make_shared<GeoArrowWkbExtensionType>(std::move(storage_type),
|
||||
serialized_data);
|
||||
}
|
||||
|
||||
std::shared_ptr<::arrow::Array> MakeArray(
|
||||
std::shared_ptr<::arrow::ArrayData> data) const override {
|
||||
return std::make_shared<::arrow::ExtensionArray>(data);
|
||||
}
|
||||
|
||||
bool ExtensionEquals(const ExtensionType& other) const override {
|
||||
return other.extension_name() == extension_name() && other.Serialize() == Serialize();
|
||||
}
|
||||
|
||||
private:
|
||||
std::string metadata_;
|
||||
};
|
||||
|
||||
std::shared_ptr<::arrow::DataType> geoarrow_wkb(
|
||||
std::string metadata = "{}",
|
||||
const std::shared_ptr<::arrow::DataType> storage = ::arrow::binary());
|
||||
|
||||
std::shared_ptr<::arrow::DataType> geoarrow_wkb_lonlat(
|
||||
const std::shared_ptr<::arrow::DataType> storage = ::arrow::binary());
|
||||
|
||||
} // namespace test
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,105 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace parquet {
|
||||
|
||||
/// \brief Feature selection when writing Parquet files
///
/// `ParquetVersion::type` governs which data types are allowed and how they
/// are represented. For example, uint32_t data will be written differently
/// depending on this value (as INT64 for PARQUET_1_0, as UINT32 for other
/// versions).
///
/// However, some features - such as compression algorithms, encryption,
/// or the improved "v2" data page format - must be enabled separately in
/// ArrowWriterProperties.
struct ParquetVersion {
  enum type : int {
    /// Enable only pre-2.2 Parquet format features when writing
    ///
    /// This setting is useful for maximum compatibility with legacy readers.
    /// Note that logical types may still be emitted, as long as they have a
    /// corresponding converted type.
    PARQUET_1_0 = 0,

    /// Enable Parquet format 2.4 and earlier features when writing
    ///
    /// This enables UINT32 as well as logical types which don't have
    /// a corresponding converted type.
    ///
    /// Note: Parquet format 2.4.0 was released in October 2017.
    PARQUET_2_4 = 1,

    /// Enable Parquet format 2.6 and earlier features when writing
    ///
    /// This enables the NANOS time unit in addition to the PARQUET_2_4
    /// features.
    ///
    /// Note: Parquet format 2.6.0 was released in September 2018.
    PARQUET_2_6 = 2,

    /// Enable latest Parquet format 2.x features
    ///
    /// This value is equal to the greatest 2.x version supported by
    /// this library.
    PARQUET_2_LATEST = PARQUET_2_6
  };
};
|
||||
|
||||
struct PageIndexLocation;
|
||||
|
||||
class FileMetaData;
|
||||
class FileCryptoMetaData;
|
||||
class RowGroupMetaData;
|
||||
|
||||
class ColumnDescriptor;
|
||||
class SchemaDescriptor;
|
||||
|
||||
class ReaderProperties;
|
||||
class ArrowReaderProperties;
|
||||
|
||||
class WriterProperties;
|
||||
class WriterPropertiesBuilder;
|
||||
class ArrowWriterProperties;
|
||||
class ArrowWriterPropertiesBuilder;
|
||||
|
||||
class EncodedStatistics;
|
||||
class Statistics;
|
||||
struct SizeStatistics;
|
||||
|
||||
namespace geospatial {
|
||||
class GeoStatistics;
|
||||
struct EncodedGeoStatistics;
|
||||
} // namespace geospatial
|
||||
|
||||
class ColumnIndex;
|
||||
class OffsetIndex;
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class FileWriter;
|
||||
class FileReader;
|
||||
|
||||
} // namespace arrow
|
||||
|
||||
namespace schema {
|
||||
class ColumnPath;
|
||||
} // namespace schema
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,883 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <iterator>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/type_fwd.h"
|
||||
#include "parquet/windows_fixup.h" // for OPTIONAL
|
||||
|
||||
namespace arrow::util {
|
||||
|
||||
class Codec;
|
||||
|
||||
} // namespace arrow::util
|
||||
|
||||
namespace parquet {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Metadata enums to match Thrift metadata
|
||||
//
|
||||
// The reason we maintain our own enums is to avoid transitive dependency on
|
||||
// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
|
||||
// public API. After building parquet-cpp, you should not need to include
|
||||
// Thrift headers in your application. This means some boilerplate to convert
|
||||
// between our types and Parquet's Thrift types.
|
||||
//
|
||||
// We can also add special values like NONE to distinguish between metadata
|
||||
// values being set and not set. As an example consider ConvertedType and
|
||||
// CompressionCodec
|
||||
|
||||
// Physical types; mirrors parquet::Type.
// The explicit numeric values are part of the enum's contract -- keep them as is.
struct Type {
  enum type {
    BOOLEAN = 0,
    INT32 = 1,
    INT64 = 2,
    INT96 = 3,
    FLOAT = 4,
    DOUBLE = 5,
    BYTE_ARRAY = 6,
    FIXED_LEN_BYTE_ARRAY = 7,
    // Sentinel; should always be the last element.
    UNDEFINED = 8
  };
};
|
||||
|
||||
// Legacy converted types; mirrors parquet::ConvertedType.
// Every enumerator carries its numeric value explicitly so that inserting or
// reordering entries cannot silently renumber the rest.
struct ConvertedType {
  enum type {
    NONE = 0,  // Not a real converted type, but means no converted type is specified
    UTF8 = 1,
    MAP = 2,
    MAP_KEY_VALUE = 3,
    LIST = 4,
    ENUM = 5,
    DECIMAL = 6,
    DATE = 7,
    TIME_MILLIS = 8,
    TIME_MICROS = 9,
    TIMESTAMP_MILLIS = 10,
    TIMESTAMP_MICROS = 11,
    UINT_8 = 12,
    UINT_16 = 13,
    UINT_32 = 14,
    UINT_64 = 15,
    INT_8 = 16,
    INT_16 = 17,
    INT_32 = 18,
    INT_64 = 19,
    JSON = 20,
    BSON = 21,
    INTERVAL = 22,
    // Values 23 and 24 are not assigned in this enum.
    //
    // DEPRECATED INVALID ConvertedType for all-null data.
    // Only useful for reading legacy files written out by interim Parquet C++ releases.
    // For writing, always emit LogicalType::Null instead.
    // See PARQUET-1990.
    NA = 25,
    UNDEFINED = 26  // Not a real converted type; should always be last element
  };
};
|
||||
|
||||
// forward declaration
|
||||
namespace format {
|
||||
|
||||
class LogicalType;
|
||||
|
||||
}
|
||||
|
||||
// Field repetition; mirrors parquet::FieldRepetitionType.
struct Repetition {
  enum type {
    REQUIRED = 0,
    OPTIONAL = 1,
    REPEATED = 2,
    // Sentinel; always last.
    UNDEFINED = 3
  };
};
|
||||
|
||||
// Sort order for page and column statistics.
//
// Types are associated with sort orders (e.g., UTF8 columns should use UNSIGNED)
// and column stats are aggregated using a sort order. As of parquet-format
// version 2.3.1, the order used to aggregate stats is always SIGNED and is not
// stored in the Parquet file. These stats are discarded for types that need
// unsigned. See PARQUET-686.
//
// Reference:
// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
// format/converter/ParquetMetadataConverter.java
struct SortOrder {
  enum type {
    SIGNED = 0,
    UNSIGNED = 1,
    UNKNOWN = 2
  };
};
|
||||
|
||||
namespace schema {
|
||||
|
||||
// Plain aggregate carrying decimal parameters alongside a converted type.
// Elsewhere in this header the "not set" default is spelled {false, -1, -1}.
struct DecimalMetadata {
  bool isset;         // whether scale/precision carry meaningful values
  int32_t scale;      // decimal scale
  int32_t precision;  // decimal precision
};
|
||||
|
||||
} // namespace schema
|
||||
|
||||
/// \brief Implementation of parquet.thrift LogicalType types.
|
||||
class PARQUET_EXPORT LogicalType {
|
||||
public:
|
||||
struct Type {
|
||||
enum type {
|
||||
UNDEFINED = 0, // Not a real logical type
|
||||
STRING = 1,
|
||||
MAP,
|
||||
LIST,
|
||||
ENUM,
|
||||
DECIMAL,
|
||||
DATE,
|
||||
TIME,
|
||||
TIMESTAMP,
|
||||
INTERVAL,
|
||||
INT,
|
||||
NIL, // Thrift NullType: annotates data that is always null
|
||||
JSON,
|
||||
BSON,
|
||||
UUID,
|
||||
FLOAT16,
|
||||
GEOMETRY,
|
||||
GEOGRAPHY,
|
||||
VARIANT,
|
||||
NONE // Not a real logical type; should always be last element
|
||||
};
|
||||
};
|
||||
|
||||
struct TimeUnit {
|
||||
enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS };
|
||||
};
|
||||
|
||||
enum class EdgeInterpolationAlgorithm {
|
||||
UNKNOWN = 0,
|
||||
SPHERICAL = 1,
|
||||
VINCENTY = 2,
|
||||
THOMAS = 3,
|
||||
ANDOYER = 4,
|
||||
KARNEY = 5
|
||||
};
|
||||
|
||||
/// \brief The latest supported Variant specification version by this library
|
||||
static constexpr int8_t kVariantSpecVersion = 1;
|
||||
|
||||
/// \brief If possible, return a logical type equivalent to the given legacy
|
||||
/// converted type (and decimal metadata if applicable).
|
||||
static std::shared_ptr<const LogicalType> FromConvertedType(
|
||||
const parquet::ConvertedType::type converted_type,
|
||||
const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1,
|
||||
-1});
|
||||
|
||||
/// \brief Return the logical type represented by the Thrift intermediary object.
|
||||
static std::shared_ptr<const LogicalType> FromThrift(
|
||||
const parquet::format::LogicalType& thrift_logical_type);
|
||||
|
||||
/// \brief Return the explicitly requested logical type.
|
||||
static std::shared_ptr<const LogicalType> String();
|
||||
static std::shared_ptr<const LogicalType> Map();
|
||||
static std::shared_ptr<const LogicalType> List();
|
||||
static std::shared_ptr<const LogicalType> Enum();
|
||||
static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0);
|
||||
static std::shared_ptr<const LogicalType> Date();
|
||||
static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc,
|
||||
LogicalType::TimeUnit::unit time_unit);
|
||||
|
||||
/// \brief Create a Timestamp logical type
|
||||
/// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized
|
||||
/// \param[in] time_unit the resolution of the timestamp
|
||||
/// \param[in] is_from_converted_type if true, the timestamp was generated
|
||||
/// by translating a legacy converted type of TIMESTAMP_MILLIS or
|
||||
/// TIMESTAMP_MICROS. Default is false.
|
||||
/// \param[in] force_set_converted_type if true, always set the
|
||||
/// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS
|
||||
/// metadata. Default is false
|
||||
static std::shared_ptr<const LogicalType> Timestamp(
|
||||
bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
|
||||
bool is_from_converted_type = false, bool force_set_converted_type = false);
|
||||
|
||||
static std::shared_ptr<const LogicalType> Interval();
|
||||
static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);
|
||||
|
||||
/// \brief Create a logical type for data that's always null
|
||||
///
|
||||
/// Any physical type can be annotated with this logical type.
|
||||
static std::shared_ptr<const LogicalType> Null();
|
||||
|
||||
static std::shared_ptr<const LogicalType> JSON();
|
||||
static std::shared_ptr<const LogicalType> BSON();
|
||||
static std::shared_ptr<const LogicalType> UUID();
|
||||
static std::shared_ptr<const LogicalType> Float16();
|
||||
static std::shared_ptr<const LogicalType> Variant(
|
||||
int8_t specVersion = kVariantSpecVersion);
|
||||
|
||||
static std::shared_ptr<const LogicalType> Geometry(std::string crs = "");
|
||||
|
||||
static std::shared_ptr<const LogicalType> Geography(
|
||||
std::string crs = "", LogicalType::EdgeInterpolationAlgorithm algorithm =
|
||||
EdgeInterpolationAlgorithm::SPHERICAL);
|
||||
|
||||
/// \brief Create a placeholder for when no logical type is specified
|
||||
static std::shared_ptr<const LogicalType> None();
|
||||
|
||||
/// \brief Return true if this logical type is consistent with the given underlying
|
||||
/// physical type.
|
||||
bool is_applicable(parquet::Type::type primitive_type,
|
||||
int32_t primitive_length = -1) const;
|
||||
|
||||
/// \brief Return true if this logical type is equivalent to the given legacy converted
|
||||
/// type (and decimal metadata if applicable).
|
||||
bool is_compatible(parquet::ConvertedType::type converted_type,
|
||||
parquet::schema::DecimalMetadata converted_decimal_metadata = {
|
||||
false, -1, -1}) const;
|
||||
|
||||
/// \brief If possible, return the legacy converted type (and decimal metadata if
|
||||
/// applicable) equivalent to this logical type.
|
||||
parquet::ConvertedType::type ToConvertedType(
|
||||
parquet::schema::DecimalMetadata* out_decimal_metadata) const;
|
||||
|
||||
/// \brief Return a printable representation of this logical type.
|
||||
std::string ToString() const;
|
||||
|
||||
/// \brief Return a JSON representation of this logical type.
|
||||
std::string ToJSON() const;
|
||||
|
||||
/// \brief Return a serializable Thrift object for this logical type.
|
||||
parquet::format::LogicalType ToThrift() const;
|
||||
|
||||
/// \brief Return true if the given logical type is equivalent to this logical type.
|
||||
bool Equals(const LogicalType& other) const;
|
||||
|
||||
/// \brief Return the enumerated type of this logical type.
|
||||
LogicalType::Type::type type() const;
|
||||
|
||||
/// \brief Return the appropriate sort order for this logical type.
|
||||
SortOrder::type sort_order() const;
|
||||
|
||||
// Type checks ...
|
||||
bool is_string() const;
|
||||
bool is_map() const;
|
||||
bool is_list() const;
|
||||
bool is_enum() const;
|
||||
bool is_decimal() const;
|
||||
bool is_date() const;
|
||||
bool is_time() const;
|
||||
bool is_timestamp() const;
|
||||
bool is_interval() const;
|
||||
bool is_int() const;
|
||||
bool is_null() const;
|
||||
bool is_JSON() const;
|
||||
bool is_BSON() const;
|
||||
bool is_UUID() const;
|
||||
bool is_float16() const;
|
||||
bool is_geometry() const;
|
||||
bool is_geography() const;
|
||||
bool is_variant() const;
|
||||
bool is_none() const;
|
||||
/// \brief Return true if this logical type is of a known type.
|
||||
bool is_valid() const;
|
||||
bool is_invalid() const;
|
||||
/// \brief Return true if this logical type is suitable for a schema GroupNode.
|
||||
bool is_nested() const;
|
||||
bool is_nonnested() const;
|
||||
/// \brief Return true if this logical type is included in the Thrift output for its
|
||||
/// node.
|
||||
bool is_serialized() const;
|
||||
|
||||
LogicalType(const LogicalType&) = delete;
|
||||
LogicalType& operator=(const LogicalType&) = delete;
|
||||
virtual ~LogicalType() noexcept;
|
||||
|
||||
protected:
|
||||
LogicalType();
|
||||
|
||||
class Impl;
|
||||
std::unique_ptr<const Impl> impl_;
|
||||
};
|
||||
|
||||
/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
|
||||
class PARQUET_EXPORT StringLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
StringLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for group nodes only.
|
||||
class PARQUET_EXPORT MapLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
MapLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for group nodes only.
|
||||
class PARQUET_EXPORT ListLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
ListLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
|
||||
class PARQUET_EXPORT EnumLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
EnumLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY,
|
||||
/// depending on the precision.
|
||||
class PARQUET_EXPORT DecimalLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0);
|
||||
int32_t precision() const;
|
||||
int32_t scale() const;
|
||||
|
||||
private:
|
||||
DecimalLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for physical type INT32.
|
||||
class PARQUET_EXPORT DateLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
DateLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS).
|
||||
class PARQUET_EXPORT TimeLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
|
||||
LogicalType::TimeUnit::unit time_unit);
|
||||
bool is_adjusted_to_utc() const;
|
||||
LogicalType::TimeUnit::unit time_unit() const;
|
||||
|
||||
private:
|
||||
TimeLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for physical type INT64.
|
||||
class PARQUET_EXPORT TimestampLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
|
||||
LogicalType::TimeUnit::unit time_unit,
|
||||
bool is_from_converted_type = false,
|
||||
bool force_set_converted_type = false);
|
||||
bool is_adjusted_to_utc() const;
|
||||
LogicalType::TimeUnit::unit time_unit() const;
|
||||
|
||||
/// \brief If true, will not set LogicalType in Thrift metadata
|
||||
bool is_from_converted_type() const;
|
||||
|
||||
/// \brief If true, will set ConvertedType for micros and millis
|
||||
/// resolution in legacy ConvertedType Thrift metadata
|
||||
bool force_set_converted_type() const;
|
||||
|
||||
private:
|
||||
TimestampLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12
|
||||
class PARQUET_EXPORT IntervalLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
IntervalLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64
|
||||
/// (for bit width 64).
|
||||
class PARQUET_EXPORT IntLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make(int bit_width, bool is_signed);
|
||||
int bit_width() const;
|
||||
bool is_signed() const;
|
||||
|
||||
private:
|
||||
IntLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for any physical type.
|
||||
class PARQUET_EXPORT NullLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
NullLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for physical type BYTE_ARRAY.
|
||||
class PARQUET_EXPORT JSONLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
JSONLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for physical type BYTE_ARRAY.
|
||||
class PARQUET_EXPORT BSONLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
BSONLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16,
|
||||
/// must encode raw UUID bytes.
|
||||
class PARQUET_EXPORT UUIDLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
UUIDLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 2,
|
||||
/// must encode raw FLOAT16 bytes.
|
||||
class PARQUET_EXPORT Float16LogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
Float16LogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Logical type carrying a CRS string.
///
/// NOTE(review): the allowed physical type is not stated in this header (the
/// sibling classes document theirs) -- confirm against the Parquet format spec.
/// Instances are created through Make(); the default constructor is private.
class PARQUET_EXPORT GeometryLogicalType : public LogicalType {
 public:
  /// \brief Create an instance.
  /// \param crs coordinate reference system string; defaults to the empty string
  static std::shared_ptr<const LogicalType> Make(std::string crs = "");

  /// \brief Accessor for the CRS string, returned by const reference.
  const std::string& crs() const;

 private:
  GeometryLogicalType() = default;
};
|
||||
|
||||
/// \brief Logical type carrying a CRS string and an edge interpolation algorithm.
///
/// NOTE(review): the allowed physical type is not stated in this header (the
/// sibling classes document theirs) -- confirm against the Parquet format spec.
/// Instances are created through Make(); the default constructor is private.
class PARQUET_EXPORT GeographyLogicalType : public LogicalType {
 public:
  /// \brief Create an instance.
  /// \param crs coordinate reference system string; defaults to the empty string
  /// \param algorithm edge interpolation algorithm; defaults to SPHERICAL
  static std::shared_ptr<const LogicalType> Make(
      std::string crs = "", LogicalType::EdgeInterpolationAlgorithm algorithm =
                                EdgeInterpolationAlgorithm::SPHERICAL);

  /// \brief Accessor for the CRS string, returned by const reference.
  const std::string& crs() const;

  /// \brief Accessor for the edge interpolation algorithm.
  LogicalType::EdgeInterpolationAlgorithm algorithm() const;

  /// \brief Textual form of the algorithm, returned as a string view.
  std::string_view algorithm_name() const;

 private:
  GeographyLogicalType() = default;
};
|
||||
|
||||
/// \brief Allowed for group nodes only.
|
||||
class PARQUET_EXPORT VariantLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make(
|
||||
int8_t specVersion = kVariantSpecVersion);
|
||||
|
||||
int8_t spec_version() const;
|
||||
|
||||
private:
|
||||
VariantLogicalType() = default;
|
||||
};
|
||||
|
||||
/// \brief Allowed for any physical type.
|
||||
class PARQUET_EXPORT NoLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
NoLogicalType() = default;
|
||||
};
|
||||
|
||||
// Internal API, for unrecognized logical types
|
||||
class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
|
||||
public:
|
||||
static std::shared_ptr<const LogicalType> Make();
|
||||
|
||||
private:
|
||||
UndefinedLogicalType() = default;
|
||||
};
|
||||
|
||||
// Data encodings. Mirrors parquet::Encoding
//
// The numeric values are spelled out explicitly; note that 1 is not assigned.
struct Encoding {
  enum type {
    PLAIN = 0,
    PLAIN_DICTIONARY = 2,
    RLE = 3,
    BIT_PACKED = 4,
    DELTA_BINARY_PACKED = 5,
    DELTA_LENGTH_BYTE_ARRAY = 6,
    DELTA_BYTE_ARRAY = 7,
    RLE_DICTIONARY = 8,
    BYTE_STREAM_SPLIT = 9,
    // Should always be last element (except UNKNOWN)
    UNDEFINED = 10,
    UNKNOWN = 999
  };
};
|
||||
|
||||
// Exposed data encodings. It is the encoding of the data read from the file,
// rather than the encoding of the data in the file. E.g., the data encoded as
// RLE_DICTIONARY in the file can be read as dictionary indices by RLE
// decoding, in which case the data read from the file is DICTIONARY encoded.
enum class ExposedEncoding {
  NO_ENCODING = 0,  // data is not encoded, i.e. already decoded during reading
  DICTIONARY = 1
};
|
||||
|
||||
/// \brief Return true if Parquet supports indicated compression type
|
||||
PARQUET_EXPORT
|
||||
bool IsCodecSupported(Compression::type codec);
|
||||
|
||||
PARQUET_EXPORT
|
||||
std::unique_ptr<Codec> GetCodec(Compression::type codec);
|
||||
|
||||
PARQUET_EXPORT
|
||||
std::unique_ptr<Codec> GetCodec(Compression::type codec,
|
||||
const CodecOptions& codec_options);
|
||||
|
||||
PARQUET_EXPORT
|
||||
std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level);
|
||||
|
||||
// Cipher identifiers, wrapped in a struct so that the enumerators are
// referred to as ParquetCipher::AES_GCM_V1 etc.
struct ParquetCipher {
  enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 };
};
|
||||
|
||||
// AAD-related metadata: two strings and a flag.
//
// NOTE(review): field semantics (presumably "additional authenticated data"
// for encryption) are defined by the encryption code, not by this header.
struct AadMetadata {
  std::string aad_prefix;
  std::string aad_file_unique;
  // Initialized so that a default-constructed AadMetadata never exposes an
  // indeterminate bool; aggregate initialization still works as before.
  bool supply_aad_prefix = false;
};
|
||||
|
||||
struct EncryptionAlgorithm {
|
||||
ParquetCipher::type algorithm;
|
||||
AadMetadata aad;
|
||||
};
|
||||
|
||||
// parquet::PageType
//
// Enumerators take their implicit values, starting at 0 for DATA_PAGE.
struct PageType {
  enum type {
    DATA_PAGE,
    INDEX_PAGE,
    DICTIONARY_PAGE,
    DATA_PAGE_V2,
    // Should always be last element
    UNDEFINED
  };
};
|
||||
|
||||
// NOTE(review): presumably reports whether pages of the given type may carry a
// checksum; the definition is not visible in this header -- confirm there.
bool PageCanUseChecksum(PageType::type pageType);
|
||||
|
||||
// Small value wrapper around a column-order tag.
class ColumnOrder {
 public:
  enum type { UNDEFINED, TYPE_DEFINED_ORDER };

  // Wrap an explicit order tag.
  explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}

  // Default to Type Defined Order
  ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}

  // Return the wrapped tag. Const-qualified so that it can also be called on
  // const objects and through const references.
  ColumnOrder::type get_order() const { return column_order_; }

  // Shared instances; only declared here, defined in the implementation file.
  static ColumnOrder undefined_;
  static ColumnOrder type_defined_;

 private:
  ColumnOrder::type column_order_;
};
|
||||
|
||||
/// \brief BoundaryOrder is a proxy around format::BoundaryOrder.
struct BoundaryOrder {
  enum type {
    Unordered = 0,
    Ascending = 1,
    Descending = 2,
    // Should always be last element
    UNDEFINED = 3
  };
};
|
||||
|
||||
/// \brief SortingColumn is a proxy around format::SortingColumn.
|
||||
struct PARQUET_EXPORT SortingColumn {
|
||||
// The column index (in this row group)
|
||||
int32_t column_idx;
|
||||
|
||||
// If true, indicates this column is sorted in descending order.
|
||||
bool descending;
|
||||
|
||||
// If true, nulls will come before non-null values, otherwise, nulls go at the end.
|
||||
bool nulls_first;
|
||||
};
|
||||
|
||||
/// \brief Two SortingColumn values are equal when all three fields match.
inline bool operator==(const SortingColumn& left, const SortingColumn& right) {
  return left.nulls_first == right.nulls_first && left.descending == right.descending &&
         left.column_idx == right.column_idx;
}
|
||||
|
||||
/// \brief Negation of operator== for SortingColumn.
inline bool operator!=(const SortingColumn& left, const SortingColumn& right) {
  return !(left == right);
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
struct ByteArray {
|
||||
ByteArray() : len(0), ptr(NULLPTR) {}
|
||||
ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}
|
||||
|
||||
ByteArray(::std::string_view view) // NOLINT implicit conversion
|
||||
: ByteArray(static_cast<uint32_t>(view.size()),
|
||||
reinterpret_cast<const uint8_t*>(view.data())) {}
|
||||
|
||||
explicit operator std::string_view() const {
|
||||
return std::string_view{reinterpret_cast<const char*>(ptr), len};
|
||||
}
|
||||
|
||||
uint32_t len;
|
||||
const uint8_t* ptr;
|
||||
};
|
||||
|
||||
/// \brief Byte-wise equality of two ByteArray values.
///
/// Lengths must match. The `len == 0` short-circuit means std::memcmp is never
/// called for empty arrays, whose pointers may be null.
inline bool operator==(const ByteArray& left, const ByteArray& right) {
  return left.len == right.len &&
         (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0);
}
|
||||
|
||||
/// \brief Negation of operator== for ByteArray.
inline bool operator!=(const ByteArray& left, const ByteArray& right) {
  return !(left == right);
}
|
||||
|
||||
struct FixedLenByteArray {
|
||||
FixedLenByteArray() : ptr(NULLPTR) {}
|
||||
explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {}
|
||||
const uint8_t* ptr;
|
||||
};
|
||||
|
||||
// Short alias for FixedLenByteArray.
using FLBA = FixedLenByteArray;
|
||||
|
||||
// Julian day at unix epoch.
//
// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
// the Julian day count starting from noon Universal time, with Julian day
// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian
// calendar).
constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);

// Number of seconds / milliseconds / microseconds / nanoseconds in one day;
// each constant is derived from the previous one by a factor of 1000.
constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24);
constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000);
constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000);
constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000);
|
||||
|
||||
// 96-bit value stored as three 32-bit words.
// NOTE(review): MANUALLY_ALIGNED_STRUCT / STRUCT_END are macros defined
// elsewhere; presumably they set the alignment to 1 and check that the struct
// is 12 bytes -- confirm against their definition.
MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
STRUCT_END(Int96, 12);
|
||||
|
||||
/// \brief Two Int96 values are equal when all three 32-bit words match.
inline bool operator==(const Int96& left, const Int96& right) {
  return std::equal(left.value, left.value + 3, right.value);
}
|
||||
|
||||
/// \brief Negation of operator== for Int96.
inline bool operator!=(const Int96& left, const Int96& right) {
  const bool same = (left == right);
  return !same;
}
|
||||
|
||||
static inline std::string ByteArrayToString(const ByteArray& a) {
|
||||
return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
|
||||
}
|
||||
|
||||
/// \brief Store `nanoseconds` into the leading 8 bytes of `i96.value`.
///
/// Only sizeof(int64_t) bytes are copied, so the third 32-bit word
/// (`value[2]`) is left untouched.
static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
  std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
}
|
||||
|
||||
// Result of splitting an Int96 timestamp into a day count and a
// nanosecond count. Both fields are unsigned 64-bit values.
struct DecodedInt96 {
  uint64_t days_since_epoch;
  uint64_t nanoseconds;
};
|
||||
|
||||
/// \brief Split an Int96 timestamp into days since the Unix epoch and nanoseconds.
///
/// `value[2]` is read as a Julian day number and rebased by
/// kJulianToUnixEpochDays; the leading 8 bytes of `value` are copied out as
/// the nanosecond count.
static inline DecodedInt96 DecodeInt96Timestamp(const parquet::Int96& i96) {
  // We do the computations in the unsigned domain to avoid undefined behaviour
  // on overflow.
  DecodedInt96 result;
  result.days_since_epoch = i96.value[2] - static_cast<uint64_t>(kJulianToUnixEpochDays);
  result.nanoseconds = 0;

  std::memcpy(&result.nanoseconds, &i96.value, sizeof(uint64_t));
  return result;
}
|
||||
|
||||
/// \brief Convert an Int96 timestamp to nanoseconds since the Unix epoch.
///
/// The sum is computed in uint64_t and then cast to int64_t.
static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) {
  const auto decoded = DecodeInt96Timestamp(i96);
  return static_cast<int64_t>(decoded.days_since_epoch * kNanosecondsPerDay +
                              decoded.nanoseconds);
}
|
||||
|
||||
/// \brief Convert an Int96 timestamp to microseconds since the Unix epoch.
///
/// The nanosecond part is divided by 1000 with integer division (the
/// remainder is dropped); the sum is computed in uint64_t and cast to int64_t.
static inline int64_t Int96GetMicroSeconds(const parquet::Int96& i96) {
  const auto decoded = DecodeInt96Timestamp(i96);
  uint64_t microseconds = decoded.nanoseconds / static_cast<uint64_t>(1000);
  return static_cast<int64_t>(decoded.days_since_epoch * kMicrosecondsPerDay +
                              microseconds);
}
|
||||
|
||||
/// \brief Convert an Int96 timestamp to milliseconds since the Unix epoch.
///
/// The nanosecond part is divided by 1000000 with integer division (the
/// remainder is dropped); the sum is computed in uint64_t and cast to int64_t.
static inline int64_t Int96GetMilliSeconds(const parquet::Int96& i96) {
  const auto decoded = DecodeInt96Timestamp(i96);
  uint64_t milliseconds = decoded.nanoseconds / static_cast<uint64_t>(1000000);
  return static_cast<int64_t>(decoded.days_since_epoch * kMillisecondsPerDay +
                              milliseconds);
}
|
||||
|
||||
/// \brief Convert an Int96 timestamp to seconds since the Unix epoch.
///
/// The nanosecond part is divided by 1000000000 with integer division (the
/// remainder is dropped); the sum is computed in uint64_t and cast to int64_t.
static inline int64_t Int96GetSeconds(const parquet::Int96& i96) {
  const auto decoded = DecodeInt96Timestamp(i96);
  uint64_t seconds = decoded.nanoseconds / static_cast<uint64_t>(1000000000);
  return static_cast<int64_t>(decoded.days_since_epoch * kSecondsPerDay + seconds);
}
|
||||
|
||||
/// \brief Render the three 32-bit words of an Int96 as decimal numbers.
///
/// Each word is followed by a single space, including the last one.
static inline std::string Int96ToString(const Int96& a) {
  std::ostringstream result;
  std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " "));
  return result.str();
}
|
||||
|
||||
/// \brief Render `len` bytes starting at `a.ptr` as decimal numbers.
///
/// Each byte is widened to uint32_t before printing, so it appears as a
/// number rather than a character, and is followed by a single space.
/// \param a array whose bytes are read
/// \param len number of bytes to read from `a.ptr`
static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
  std::ostringstream result;
  std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " "));
  return result.str();
}
|
||||
|
||||
// Primary template, intentionally empty: the per-type specializations below
// provide value_type, value_byte_size and printf_code.
template <Type::type TYPE>
struct type_traits {};
|
||||
|
||||
template <>
|
||||
struct type_traits<Type::BOOLEAN> {
|
||||
using value_type = bool;
|
||||
|
||||
static constexpr int value_byte_size = 1;
|
||||
static constexpr const char* printf_code = "d";
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_traits<Type::INT32> {
|
||||
using value_type = int32_t;
|
||||
|
||||
static constexpr int value_byte_size = 4;
|
||||
static constexpr const char* printf_code = "d";
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_traits<Type::INT64> {
|
||||
using value_type = int64_t;
|
||||
|
||||
static constexpr int value_byte_size = 8;
|
||||
static constexpr const char* printf_code =
|
||||
(sizeof(long) == 64) ? "ld" : "lld"; // NOLINT: runtime/int
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_traits<Type::INT96> {
|
||||
using value_type = Int96;
|
||||
|
||||
static constexpr int value_byte_size = 12;
|
||||
static constexpr const char* printf_code = "s";
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_traits<Type::FLOAT> {
|
||||
using value_type = float;
|
||||
|
||||
static constexpr int value_byte_size = 4;
|
||||
static constexpr const char* printf_code = "f";
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_traits<Type::DOUBLE> {
|
||||
using value_type = double;
|
||||
|
||||
static constexpr int value_byte_size = 8;
|
||||
static constexpr const char* printf_code = "lf";
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_traits<Type::BYTE_ARRAY> {
|
||||
using value_type = ByteArray;
|
||||
|
||||
static constexpr int value_byte_size = sizeof(ByteArray);
|
||||
static constexpr const char* printf_code = "s";
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
|
||||
using value_type = FixedLenByteArray;
|
||||
|
||||
static constexpr int value_byte_size = sizeof(FixedLenByteArray);
|
||||
static constexpr const char* printf_code = "s";
|
||||
};
|
||||
|
||||
template <Type::type TYPE>
|
||||
struct PhysicalType {
|
||||
using c_type = typename type_traits<TYPE>::value_type;
|
||||
static constexpr Type::type type_num = TYPE;
|
||||
};
|
||||
|
||||
// Convenience aliases, one PhysicalType instantiation per Type::type enumerator
// that has a type_traits specialization.
using BooleanType = PhysicalType<Type::BOOLEAN>;
using Int32Type = PhysicalType<Type::INT32>;
using Int64Type = PhysicalType<Type::INT64>;
using Int96Type = PhysicalType<Type::INT96>;
using FloatType = PhysicalType<Type::FLOAT>;
using DoubleType = PhysicalType<Type::DOUBLE>;
using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
using FLBAType = PhysicalType<Type::FIXED_LEN_BYTE_ARRAY>;
|
||||
|
||||
template <typename Type>
|
||||
inline std::string format_fwf(int width) {
|
||||
std::stringstream ss;
|
||||
ss << "%-" << width << type_traits<Type::type_num>::printf_code;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
// String conversion and sort-order helpers. Only the declarations live here;
// the behaviour is defined in the implementation file.

// Textual form of an encoding.
PARQUET_EXPORT std::string EncodingToString(Encoding::type t);

// Textual form of a converted type.
PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t);

// Textual form of a physical type.
PARQUET_EXPORT std::string TypeToString(Type::type t);

// Overload that additionally takes a type length.
PARQUET_EXPORT std::string TypeToString(Type::type t, int type_length);

// Format a statistics value given as raw bytes in `val`. The logical type is
// optional and defaults to a null pointer.
PARQUET_EXPORT std::string FormatStatValue(
    Type::type parquet_type, ::std::string_view val,
    const std::shared_ptr<const LogicalType>& logical_type = NULLPTR);

// Byte size associated with a physical type.
PARQUET_EXPORT int GetTypeByteSize(Type::type t);

// Sort order used by default for a physical type.
PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);

// Sort order for a (converted type, physical type) pair.
PARQUET_EXPORT SortOrder::type GetSortOrder(ConvertedType::type converted,
                                            Type::type primitive);

// Sort order for a (logical type, physical type) pair.
PARQUET_EXPORT SortOrder::type GetSortOrder(
    const std::shared_ptr<const LogicalType>& logical_type, Type::type primitive);
|
||||
|
||||
// PLAIN_DICTIONARY is deprecated but used to be used as a dictionary index
|
||||
// encoding.
|
||||
constexpr bool IsDictionaryIndexEncoding(Encoding::type e) {
|
||||
return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY;
|
||||
}
|
||||
|
||||
} // namespace parquet
|
||||
@@ -0,0 +1,21 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/util/windows_compatibility.h"
|
||||
#include "parquet/windows_fixup.h"
|
||||
@@ -0,0 +1,29 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This header needs to be included multiple times.
|
||||
|
||||
#include "arrow/util/windows_fixup.h"
|
||||
|
||||
#ifdef _WIN32

// parquet.thrift's OPTIONAL RepetitionType conflicts with a Windows #define
#  ifdef OPTIONAL
#    undef OPTIONAL
#  endif

#endif  // _WIN32
|
||||
@@ -0,0 +1,50 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "parquet/hasher.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/types.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
/// \brief Hasher implementation; every member overrides the matching Hasher method.
///
/// NOTE(review): the name and kParquetBloomXxHashSeed suggest xxHash seeded
/// with 0 for Parquet bloom filters -- the definitions are not in this header,
/// confirm against the implementation file.
class PARQUET_EXPORT XxHasher : public Hasher {
 public:
  // Single-value overloads: one 64-bit hash per value.
  uint64_t Hash(int32_t value) const override;
  uint64_t Hash(int64_t value) const override;
  uint64_t Hash(float value) const override;
  uint64_t Hash(double value) const override;
  uint64_t Hash(const Int96* value) const override;
  uint64_t Hash(const ByteArray* value) const override;
  // FLBA stores no length of its own, so it is passed separately.
  uint64_t Hash(const FLBA* val, uint32_t len) const override;

  // Batch overloads taking `num_values` inputs and an output array `hashes`.
  void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const override;
  void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const override;
  void Hashes(const float* values, int num_values, uint64_t* hashes) const override;
  void Hashes(const double* values, int num_values, uint64_t* hashes) const override;
  void Hashes(const Int96* values, int num_values, uint64_t* hashes) const override;
  void Hashes(const ByteArray* values, int num_values, uint64_t* hashes) const override;
  void Hashes(const FLBA* values, uint32_t type_len, int num_values,
              uint64_t* hashes) const override;

  static constexpr int kParquetBloomXxHashSeed = 0;
};
|
||||
|
||||
} // namespace parquet
|
||||
Reference in New Issue
Block a user