Initial commit
This commit is contained in:
@@ -0,0 +1,184 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
|
||||
#include "parquet/level_conversion.h"
|
||||
#include "parquet/platform.h"
|
||||
#include "parquet/schema.h"
|
||||
|
||||
namespace parquet {
|
||||
|
||||
class ArrowReaderProperties;
|
||||
class ArrowWriterProperties;
|
||||
class WriterProperties;
|
||||
|
||||
namespace arrow {
|
||||
|
||||
/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
|
||||
/// schema into a Parquet schema.
|
||||
///
|
||||
/// @{
|
||||
|
||||
PARQUET_EXPORT
|
||||
::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
|
||||
const WriterProperties& properties,
|
||||
const ArrowWriterProperties& arrow_properties,
|
||||
schema::NodePtr* out);
|
||||
|
||||
PARQUET_EXPORT
|
||||
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
|
||||
const WriterProperties& properties,
|
||||
const ArrowWriterProperties& arrow_properties,
|
||||
std::shared_ptr<SchemaDescriptor>* out);
|
||||
|
||||
PARQUET_EXPORT
|
||||
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
|
||||
const WriterProperties& properties,
|
||||
std::shared_ptr<SchemaDescriptor>* out);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
|
||||
/// schema into an Arrow schema.
|
||||
///
|
||||
/// @{
|
||||
|
||||
PARQUET_EXPORT
|
||||
::arrow::Status FromParquetSchema(
|
||||
const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
|
||||
const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
|
||||
std::shared_ptr<::arrow::Schema>* out);
|
||||
|
||||
PARQUET_EXPORT
|
||||
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
|
||||
const ArrowReaderProperties& properties,
|
||||
std::shared_ptr<::arrow::Schema>* out);
|
||||
|
||||
PARQUET_EXPORT
|
||||
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
|
||||
std::shared_ptr<::arrow::Schema>* out);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Bridge between an arrow::Field and parquet column indices.
|
||||
struct PARQUET_EXPORT SchemaField {
|
||||
std::shared_ptr<::arrow::Field> field;
|
||||
std::vector<SchemaField> children;
|
||||
|
||||
// Only set for leaf nodes
|
||||
int column_index = -1;
|
||||
|
||||
parquet::internal::LevelInfo level_info;
|
||||
|
||||
bool is_leaf() const { return column_index != -1; }
|
||||
};
|
||||
|
||||
/// \brief Bridge between a parquet Schema and an arrow Schema.
|
||||
///
|
||||
/// Expose parquet columns as a tree structure. Useful traverse and link
|
||||
/// between arrow's Schema and parquet's Schema.
|
||||
struct PARQUET_EXPORT SchemaManifest {
|
||||
static ::arrow::Status Make(
|
||||
const SchemaDescriptor* schema,
|
||||
const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
|
||||
const ArrowReaderProperties& properties, SchemaManifest* manifest);
|
||||
|
||||
const SchemaDescriptor* descr;
|
||||
std::shared_ptr<::arrow::Schema> origin_schema;
|
||||
std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
|
||||
std::vector<SchemaField> schema_fields;
|
||||
|
||||
std::unordered_map<int, const SchemaField*> column_index_to_field;
|
||||
std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
|
||||
|
||||
::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
|
||||
auto it = column_index_to_field.find(column_index);
|
||||
if (it == column_index_to_field.end()) {
|
||||
return ::arrow::Status::KeyError("Column index ", column_index,
|
||||
" not found in schema manifest, may be malformed");
|
||||
}
|
||||
*out = it->second;
|
||||
return ::arrow::Status::OK();
|
||||
}
|
||||
|
||||
const SchemaField* GetParent(const SchemaField* field) const {
|
||||
// Returns nullptr also if not found
|
||||
auto it = child_to_parent.find(field);
|
||||
if (it == child_to_parent.end()) {
|
||||
return NULLPTR;
|
||||
}
|
||||
return it->second;
|
||||
}
|
||||
|
||||
/// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
|
||||
/// correspond to the column root (first node below the parquet schema's root group) of
|
||||
/// each leaf referenced in column_indices.
|
||||
///
|
||||
/// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
|
||||
/// the roots are `a` and `i` (return=[0,2]).
|
||||
///
|
||||
/// root
|
||||
/// -- a <------
|
||||
/// -- -- b | |
|
||||
/// -- -- -- c |
|
||||
/// -- -- -- d |
|
||||
/// -- -- -- -- e
|
||||
/// -- f
|
||||
/// -- -- g
|
||||
/// -- -- -- h
|
||||
/// -- i <---
|
||||
/// -- -- j |
|
||||
/// -- -- -- k
|
||||
::arrow::Result<std::vector<int>> GetFieldIndices(
|
||||
const std::vector<int>& column_indices) const {
|
||||
const schema::GroupNode* group = descr->group_node();
|
||||
std::unordered_set<int> already_added;
|
||||
|
||||
std::vector<int> out;
|
||||
for (int column_idx : column_indices) {
|
||||
if (column_idx < 0 || column_idx >= descr->num_columns()) {
|
||||
return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
|
||||
}
|
||||
|
||||
auto field_node = descr->GetColumnRoot(column_idx);
|
||||
auto field_idx = group->FieldIndex(*field_node);
|
||||
if (field_idx == -1) {
|
||||
return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
|
||||
}
|
||||
|
||||
if (already_added.insert(field_idx).second) {
|
||||
out.push_back(field_idx);
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace arrow
|
||||
} // namespace parquet
|
||||
Reference in New Issue
Block a user