Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,152 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include "parquet/encryption/encryption.h"
#include "parquet/encryption/file_key_wrapper.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/kms_client_factory.h"
#include "parquet/platform.h"
namespace parquet::encryption {
/// Default values used as member initializers by EncryptionConfiguration and
/// DecryptionConfiguration, and as default arguments of
/// CryptoFactory::RotateMasterKeys().
/// Default cipher for EncryptionConfiguration::encryption_algorithm.
static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
ParquetCipher::AES_GCM_V1;
/// Default for EncryptionConfiguration::plaintext_footer.
static constexpr bool kDefaultPlaintextFooter = false;
/// Default for EncryptionConfiguration::double_wrapping and for the
/// double_wrapping argument of CryptoFactory::RotateMasterKeys().
static constexpr bool kDefaultDoubleWrapping = true;
/// Default cache lifetime, in seconds.
static constexpr double kDefaultCacheLifetimeSeconds = 600; // 10 minutes
/// Default for EncryptionConfiguration::internal_key_material.
static constexpr bool kDefaultInternalKeyMaterial = true;
/// Default for EncryptionConfiguration::uniform_encryption.
static constexpr bool kDefaultUniformEncryption = false;
/// Default for EncryptionConfiguration::data_key_length_bits.
static constexpr int32_t kDefaultDataKeyLengthBits = 128;
/// High-level encryption settings, passed to
/// CryptoFactory::GetFileEncryptionProperties().
struct PARQUET_EXPORT EncryptionConfiguration {
/// Create a configuration for the given footer master key ID. All other fields
/// keep their default member initializers until assigned by the caller.
explicit EncryptionConfiguration(const std::string& footer_key)
: footer_key(footer_key) {}
/// ID of the master key for footer encryption/signing
std::string footer_key;
/// List of columns to encrypt, with column master key IDs (see HIVE-21848).
/// Format: "columnKeyID:colName,colName;columnKeyID:colName..."
/// Either
/// (1) column_keys must be set
/// or
/// (2) uniform_encryption must be set to true
/// If none of (1) and (2) are true, or if both are true, an exception will be
/// thrown.
std::string column_keys;
/// Encrypt footer and all columns with the same encryption key.
bool uniform_encryption = kDefaultUniformEncryption;
/// Parquet encryption algorithm. Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1".
ParquetCipher::type encryption_algorithm = kDefaultEncryptionAlgorithm;
/// Write files with plaintext footer.
/// The default is false - files are written with encrypted footer.
bool plaintext_footer = kDefaultPlaintextFooter;
/// Use double wrapping - where data encryption keys (DEKs) are encrypted with key
/// encryption keys (KEKs), which in turn are encrypted with master keys.
/// The default is true. If set to false, use single wrapping - where DEKs are
/// encrypted directly with master keys.
bool double_wrapping = kDefaultDoubleWrapping;
/// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client
/// objects).
/// The default is 600 (10 minutes).
double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;
/// Store key material inside Parquet file footers; this mode doesn't produce
/// additional files. By default, true. If set to false, key material is stored in
/// separate files in the same folder, which enables key rotation for immutable
/// Parquet files.
bool internal_key_material = kDefaultInternalKeyMaterial;
/// Length of data encryption keys (DEKs), randomly generated by parquet key
/// management tools. Can be 128, 192 or 256 bits.
/// The default is 128 bits.
int32_t data_key_length_bits = kDefaultDataKeyLengthBits;
};
/// High-level decryption settings, passed to
/// CryptoFactory::GetFileDecryptionProperties().
struct PARQUET_EXPORT DecryptionConfiguration {
/// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client
/// objects), in seconds.
/// The default is 600 (10 minutes).
double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;
};
/// This is a core class that translates the parameters of high level encryption (like
/// the names of encrypted columns, names of master keys, etc), into parameters of low
/// level encryption (like the key metadata, DEK, etc). A factory that produces the low
/// level FileEncryptionProperties and FileDecryptionProperties objects, from the high
/// level parameters.
class PARQUET_EXPORT CryptoFactory {
public:
/// A KmsClientFactory object must be registered via this method before calling any of
/// GetFileEncryptionProperties()/GetFileDecryptionProperties() methods.
void RegisterKmsClientFactory(std::shared_ptr<KmsClientFactory> kms_client_factory);
/// Get the encryption properties for a Parquet file.
/// If external key material is used then a file system and path to the
/// parquet file must be provided.
/// \param kms_connection_config connection settings for the KMS
/// \param encryption_config high-level encryption settings
/// \param file_path path of the Parquet file; defaults to the empty string
/// \param file_system file system holding the file; defaults to NULLPTR
std::shared_ptr<FileEncryptionProperties> GetFileEncryptionProperties(
const KmsConnectionConfig& kms_connection_config,
const EncryptionConfiguration& encryption_config, const std::string& file_path = "",
const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);
/// Get decryption properties for a Parquet file.
/// If external key material is used then a file system and path to the
/// parquet file must be provided.
/// \param kms_connection_config connection settings for the KMS
/// \param decryption_config high-level decryption settings
/// \param file_path path of the Parquet file; defaults to the empty string
/// \param file_system file system holding the file; defaults to NULLPTR
std::shared_ptr<FileDecryptionProperties> GetFileDecryptionProperties(
const KmsConnectionConfig& kms_connection_config,
const DecryptionConfiguration& decryption_config, const std::string& file_path = "",
const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);
/// Forwards to KeyToolkit::RemoveCacheEntriesForToken() on the internal key toolkit.
void RemoveCacheEntriesForToken(const std::string& access_token) {
key_toolkit_->RemoveCacheEntriesForToken(access_token);
}
/// Forwards to KeyToolkit::RemoveCacheEntriesForAllTokens() on the internal key
/// toolkit.
void RemoveCacheEntriesForAllTokens() {
key_toolkit_->RemoveCacheEntriesForAllTokens();
}
/// Rotates master encryption keys for a Parquet file that uses external key material.
/// In single wrapping mode, data encryption keys are decrypted with the old master keys
/// and then re-encrypted with new master keys.
/// In double wrapping mode, key encryption keys are decrypted with the old master keys
/// and then re-encrypted with new master keys.
/// This relies on the KMS supporting versioning, such that the old master key is
/// used when unwrapping a key, and the latest version is used when wrapping a key.
void RotateMasterKeys(const KmsConnectionConfig& kms_connection_config,
const std::string& parquet_file_path,
const std::shared_ptr<::arrow::fs::FileSystem>& file_system,
bool double_wrapping = kDefaultDoubleWrapping,
double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds);
private:
/// Helper declared here and defined elsewhere.
/// NOTE(review): presumably parses the `column_keys` string of an
/// EncryptionConfiguration into per-column properties - confirm against the
/// implementation.
ColumnPathToEncryptionPropertiesMap GetColumnEncryptionProperties(
int dek_length, const std::string& column_keys, FileKeyWrapper* key_wrapper);
/// Key utilities object for kms client initialization and cache control
std::shared_ptr<KeyToolkit> key_toolkit_ = std::make_shared<KeyToolkit>();
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,441 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include "arrow/util/secure_string.h"
#include "parquet/exception.h"
#include "parquet/schema.h"
#include "parquet/types.h"
namespace parquet {
/// Cipher that FileEncryptionProperties::Builder starts with.
static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
ParquetCipher::AES_GCM_V1;
/// NOTE(review): not used in this header; presumably an upper bound, in bytes,
/// on AAD metadata - confirm against the implementation.
static constexpr int32_t kMaximalAadMetadataLength = 256;
/// Initial value of FileEncryptionProperties::Builder's encrypted-footer flag.
static constexpr bool kDefaultEncryptedFooter = true;
/// Initial value of FileDecryptionProperties::Builder's footer-integrity flag.
static constexpr bool kDefaultCheckSignature = true;
/// Initial value of FileDecryptionProperties::Builder's plaintext-files flag.
static constexpr bool kDefaultAllowPlaintextFiles = false;
/// NOTE(review): not used in this header; presumably the length, in bytes, of
/// the file-unique part of the AAD - confirm against the implementation.
static constexpr int32_t kAadFileUniqueLength = 8;
class ColumnDecryptionProperties;
/// Map keyed by std::string to shared ColumnDecryptionProperties; the alias
/// name indicates the key is a column path.
using ColumnPathToDecryptionPropertiesMap =
std::map<std::string, std::shared_ptr<ColumnDecryptionProperties>>;
class ColumnEncryptionProperties;
/// Map keyed by std::string to shared ColumnEncryptionProperties; the alias
/// name indicates the key is a column path.
using ColumnPathToEncryptionPropertiesMap =
std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>;
/// Abstract interface for objects that hand out decryption keys on request.
class PARQUET_EXPORT DecryptionKeyRetriever {
 public:
  /// \brief Retrieve a key.
  /// \param key_id identifier of the requested key; its interpretation is up
  ///        to the implementing class
  /// \return the key as a SecureString
  virtual ::arrow::util::SecureString GetKey(const std::string& key_id) = 0;

  virtual ~DecryptionKeyRetriever() = default;
};
/// Simple integer key retriever.
///
/// Keys are held in a map indexed by 32 bit unsigned integer IDs. GetKey()
/// receives the ID as a string whose bytes are the native-endian
/// representation of that integer.
class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever {
 public:
  /// Declared here, defined elsewhere.
  /// NOTE(review): presumably stores `key` under `key_id` - confirm against
  /// the implementation.
  void PutKey(uint32_t key_id, ::arrow::util::SecureString key);

  /// \brief Retrieve the key stored under the integer encoded in
  /// \p key_id_string.
  ///
  /// \param key_id_string exactly sizeof(uint32_t) bytes holding a
  ///        native-endian 32 bit unsigned integer key_id
  /// \return the key stored for the decoded id
  /// \throws ParquetException if \p key_id_string does not have exactly
  ///         sizeof(uint32_t) bytes
  /// \throws std::out_of_range (from std::map::at) if no key is stored for
  ///         the decoded id
  ::arrow::util::SecureString GetKey(const std::string& key_id_string) override {
    uint32_t key_id;
    // Checked at runtime rather than with assert(): assert is compiled out
    // under NDEBUG, and the memcpy below would then read past the end of a
    // too-short string.
    if (key_id_string.size() != sizeof(key_id)) {
      throw ParquetException(
          "IntegerKeyIdRetriever: key id must be exactly 4 bytes long");
    }
    memcpy(&key_id, key_id_string.data(), sizeof(key_id));
    return key_map_.at(key_id);
  }

 private:
  std::map<uint32_t, ::arrow::util::SecureString> key_map_;
};
/// Simple string key retriever.
///
/// Holds keys in a map indexed by string key IDs. Both member functions are
/// only declared here; their definitions are elsewhere.
class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever {
public:
/// NOTE(review): presumably stores `key` under `key_id` - confirm against the
/// implementation.
void PutKey(std::string key_id, ::arrow::util::SecureString key);
/// \brief Retrieve a key (override of DecryptionKeyRetriever::GetKey).
/// NOTE(review): behavior for an unknown key_id is not visible here - confirm.
::arrow::util::SecureString GetKey(const std::string& key_id) override;
private:
std::map<std::string, ::arrow::util::SecureString> key_map_;
};
/// ParquetException subtype whose message is the string passed to the
/// constructor.
/// NOTE(review): the name suggests it signals a column that cannot be
/// accessed - confirm against the throw sites.
class PARQUET_EXPORT HiddenColumnException : public ParquetException {
public:
/// \param columnPath text forwarded (as a C string) to ParquetException; the
///        parameter name indicates callers pass the column path
explicit HiddenColumnException(const std::string& columnPath)
: ParquetException(columnPath.c_str()) {}
};
/// ParquetException subtype whose message is the string passed to the
/// constructor.
/// NOTE(review): the name suggests it signals that access to a key was
/// refused - confirm against the throw sites.
class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException {
public:
/// \param columnPath text forwarded (as a C string) to ParquetException; the
///        parameter name indicates callers pass the column path
explicit KeyAccessDeniedException(const std::string& columnPath)
: ParquetException(columnPath.c_str()) {}
};
/// \brief View the contents of \p str as a sequence of bytes.
///
/// \return NULLPTR when \p str is empty; otherwise a pointer to the string's
///         own character buffer, reinterpreted as uint8_t. The pointer refers
///         to storage owned by \p str, so it must not be used after \p str is
///         modified or destroyed.
inline const uint8_t* str2bytes(const std::string& str) {
  if (!str.empty()) {
    return reinterpret_cast<const uint8_t*>(str.c_str());
  }
  return NULLPTR;
}
/// \brief View the contents of \p str as a span of bytes.
///
/// \return a default-constructed span when \p str is empty; otherwise a span
///         over the string's own character buffer with str.size() elements.
///         The span refers to storage owned by \p str, so it must not be used
///         after \p str is modified or destroyed.
inline ::arrow::util::span<const uint8_t> str2span(const std::string& str) {
  if (!str.empty()) {
    const auto* first_byte = reinterpret_cast<const uint8_t*>(str.data());
    return {first_byte, str.size()};
  }
  return {};
}
/// Encryption settings for a single column; instances are created through the
/// nested Builder.
class PARQUET_EXPORT ColumnEncryptionProperties {
public:
class PARQUET_EXPORT Builder {
public:
/// Convenience builder for encrypted columns.
/// \param name column path, stored as given
explicit Builder(std::string name) : Builder(std::move(name), true) {}
/// Convenience builder for encrypted columns.
/// \param path column path; stored in its dot-string form
explicit Builder(const schema::ColumnPath& path)
: Builder(path.ToDotString(), true) {}
/// Set a column-specific key.
/// If key is not set on an encrypted column, the column will
/// be encrypted with the footer key.
/// keyBytes Key length must be either 16, 24 or 32 bytes.
/// Caller is responsible for wiping out the input key array.
Builder* key(::arrow::util::SecureString column_key);
/// Set a key retrieval metadata.
/// use either key_metadata() or key_id(), not both
Builder* key_metadata(std::string key_metadata);
/// A convenience function to set key metadata using a string id.
/// Set a key retrieval metadata (converted from String).
/// use either key_metadata() or key_id(), not both
/// key_id will be converted to metadata (UTF-8 array).
Builder* key_id(std::string key_id);
/// Create the properties object from the builder's current state.
/// Uses `new` directly because the ColumnEncryptionProperties constructor is
/// private.
std::shared_ptr<ColumnEncryptionProperties> build() {
return std::shared_ptr<ColumnEncryptionProperties>(
new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_));
}
private:
std::string column_path_;
bool encrypted_;
::arrow::util::SecureString key_;
std::string key_metadata_;
/// Common constructor that both public constructors delegate to.
Builder(std::string path, bool encrypted)
: column_path_(std::move(path)), encrypted_(encrypted) {}
};
/// Column path these properties apply to.
const std::string& column_path() const { return column_path_; }
/// Value of the `encrypted` flag given at construction.
bool is_encrypted() const { return encrypted_; }
/// NOTE(review): the flag is set by the constructor, which is defined
/// elsewhere; presumably true when no column-specific key was given - confirm.
bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; }
/// Column-specific key.
const ::arrow::util::SecureString& key() const { return key_; }
/// Key retrieval metadata.
const std::string& key_metadata() const { return key_metadata_; }
private:
std::string column_path_;
bool encrypted_;
bool encrypted_with_footer_key_;
::arrow::util::SecureString key_;
std::string key_metadata_;
/// Private: instances are created through Builder::build().
explicit ColumnEncryptionProperties(bool encrypted, std::string column_path,
::arrow::util::SecureString key,
std::string key_metadata);
};
/// Explicit decryption key for a single column; instances are created through
/// the nested Builder.
class PARQUET_EXPORT ColumnDecryptionProperties {
public:
class PARQUET_EXPORT Builder {
public:
/// \param name column path, stored as given
explicit Builder(std::string name) : column_path_(std::move(name)) {}
/// \param path column path; stored in its dot-string form
explicit Builder(const schema::ColumnPath& path) : Builder(path.ToDotString()) {}
/// Set an explicit column key. If applied on a file that contains
/// key metadata for this column the metadata will be ignored,
/// the column will be decrypted with this key.
/// key length must be either 16, 24 or 32 bytes.
Builder* key(::arrow::util::SecureString key);
/// Create the properties object (declared here, defined elsewhere).
std::shared_ptr<ColumnDecryptionProperties> build();
private:
std::string column_path_;
::arrow::util::SecureString key_;
};
/// Column path these properties apply to.
const std::string& column_path() const { return column_path_; }
/// Explicit key for the column.
const ::arrow::util::SecureString& key() const { return key_; }
private:
std::string column_path_;
::arrow::util::SecureString key_;
/// This class is only required for setting explicit column decryption keys -
/// to override key retriever (or to provide keys when key metadata and/or
/// key retriever are not available)
explicit ColumnDecryptionProperties(std::string column_path,
::arrow::util::SecureString key);
};
/// Callback interface for checking the AAD prefix of a file.
class PARQUET_EXPORT AADPrefixVerifier {
 public:
  /// \brief Verify the identity (AAD prefix) of an individual file, or of a
  /// file collection in a data set.
  ///
  /// Implementations throw an exception if the AAD prefix is wrong.
  /// In a data set, AAD prefixes should be collected and then checked for
  /// missing files.
  virtual void Verify(const std::string& aad_prefix) = 0;

  virtual ~AADPrefixVerifier() = default;
};
/// Decryption settings for a Parquet file; instances are created through the
/// nested Builder.
class PARQUET_EXPORT FileDecryptionProperties {
 public:
  class PARQUET_EXPORT Builder {
   public:
    /// Start from the defaults: plaintext footer integrity is checked and
    /// plaintext files are not allowed.
    Builder()
        : check_plaintext_footer_integrity_(kDefaultCheckSignature),
          plaintext_files_allowed_(kDefaultAllowPlaintextFiles) {}

    /// Set an explicit footer key.
    ///
    /// When the file contains footer key metadata, that metadata is ignored
    /// and the footer is decrypted/verified with this key. Without an explicit
    /// key, the footer key is fetched from the key retriever.
    ///
    /// With explicit keys or an AAD prefix, a new properties object must be
    /// created for each encrypted file. Explicit keys (footer and column) are
    /// cloned; once the file has been read, the cloned keys held in the
    /// properties are wiped out (array values set to 0). Wiping the input key
    /// array remains the caller's responsibility.
    ///
    /// \param footer_key key whose length must be 16, 24 or 32 bytes
    Builder* footer_key(::arrow::util::SecureString footer_key);

    /// Set explicit column keys (decryption properties).
    ///
    /// A key retriever may be set on the same properties object as well.
    /// During decryption, explicit keys are looked up before the retriever
    /// callback is invoked; when an explicit key exists for a footer or a
    /// column, its key metadata is ignored.
    Builder* column_keys(
        ColumnPathToDecryptionPropertiesMap column_decryption_properties);

    /// Set a key retriever callback.
    ///
    /// Explicit footer or column keys may be set on the same properties object
    /// as well. During decryption, explicit keys are looked up before the
    /// retriever callback is invoked; when an explicit key exists for a footer
    /// or a column, its key metadata is ignored.
    Builder* key_retriever(std::shared_ptr<DecryptionKeyRetriever> key_retriever);

    /// Skip integrity verification of plaintext footers.
    ///
    /// Unless this is called, plaintext footers are verified at runtime and an
    /// exception is thrown when:
    /// - the footer signing key is not available
    ///   (not passed, or not found by the key retriever)
    /// - footer content and signature don't match
    Builder* disable_footer_signature_verification() {
      check_plaintext_footer_integrity_ = false;
      return this;
    }

    /// Explicitly supply the file AAD prefix.
    ///
    /// Required when a prefix was used for encryption but is not stored in the
    /// file. If the file does store an AAD prefix, it is compared with the
    /// supplied value and an exception is thrown when they differ.
    Builder* aad_prefix(std::string aad_prefix);

    /// Set the callback that verifies AAD prefixes stored in the file.
    Builder* aad_prefix_verifier(std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier);

    /// Allow reading plaintext (unencrypted) files.
    ///
    /// By default a decryptor refuses plaintext files, so that files left
    /// unencrypted by mistake are detected. After calling this method the
    /// caller should use some other means to make sure files with sensitive
    /// data are encrypted.
    Builder* plaintext_files_allowed() {
      plaintext_files_allowed_ = true;
      return this;
    }

    /// Create the properties object from the builder's current state.
    std::shared_ptr<FileDecryptionProperties> build() {
      // The FileDecryptionProperties constructor is private, so the object is
      // created with `new` and handed straight to the shared_ptr.
      std::shared_ptr<FileDecryptionProperties> properties(new FileDecryptionProperties(
          footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_,
          aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_));
      return properties;
    }

   private:
    ::arrow::util::SecureString footer_key_;
    std::string aad_prefix_;
    std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
    ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
    std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
    bool check_plaintext_footer_integrity_;
    bool plaintext_files_allowed_;
  };

  /// Explicit key for the given column path (declared here, defined elsewhere).
  const ::arrow::util::SecureString& column_key(const std::string& column_path) const;

  /// Explicit footer key.
  const ::arrow::util::SecureString& footer_key() const { return footer_key_; }

  /// Explicitly supplied AAD prefix.
  const std::string& aad_prefix() const { return aad_prefix_; }

  /// Key retriever callback.
  const std::shared_ptr<DecryptionKeyRetriever>& key_retriever() const {
    return key_retriever_;
  }

  /// Whether the integrity of plaintext footers is checked.
  bool check_plaintext_footer_integrity() const {
    return check_plaintext_footer_integrity_;
  }

  /// Whether reading plaintext files is allowed.
  bool plaintext_files_allowed() const { return plaintext_files_allowed_; }

  /// Callback used to verify AAD prefixes.
  const std::shared_ptr<AADPrefixVerifier>& aad_prefix_verifier() const {
    return aad_prefix_verifier_;
  }

 private:
  ::arrow::util::SecureString footer_key_;
  std::string aad_prefix_;
  std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
  ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
  std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
  bool check_plaintext_footer_integrity_;
  bool plaintext_files_allowed_;

  /// Private: instances are created through Builder::build().
  FileDecryptionProperties(
      ::arrow::util::SecureString footer_key,
      std::shared_ptr<DecryptionKeyRetriever> key_retriever,
      bool check_plaintext_footer_integrity, std::string aad_prefix,
      std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
      ColumnPathToDecryptionPropertiesMap column_decryption_properties,
      bool plaintext_files_allowed);
};
/// Encryption settings for a Parquet file; instances are created through the
/// nested Builder.
class PARQUET_EXPORT FileEncryptionProperties {
 public:
  class PARQUET_EXPORT Builder {
   public:
    /// Start from the defaults: default cipher, default encrypted-footer
    /// setting, and the AAD-prefix storage flag cleared.
    /// \param footer_key key for the footer; moved into the builder
    explicit Builder(::arrow::util::SecureString footer_key)
        : parquet_cipher_(kDefaultEncryptionAlgorithm),
          encrypted_footer_(kDefaultEncryptedFooter),
          footer_key_(std::move(footer_key)),
          store_aad_prefix_in_file_(false) {}

    /// Create files with plaintext footer.
    /// Unless this is called, files are created with an encrypted footer
    /// (the default).
    Builder* set_plaintext_footer() {
      encrypted_footer_ = false;
      return this;
    }

    /// Set the encryption algorithm.
    /// Unless this is called, files are encrypted with AES_GCM_V1 (the default).
    Builder* algorithm(ParquetCipher::type parquet_cipher) {
      parquet_cipher_ = parquet_cipher;
      return this;
    }

    /// Set key retrieval metadata, converted from a string ID.
    /// Use either footer_key_metadata or footer_key_id, not both.
    Builder* footer_key_id(std::string key_id);

    /// Set key retrieval metadata.
    /// Use either footer_key_metadata or footer_key_id, not both.
    Builder* footer_key_metadata(std::string footer_key_metadata);

    /// Set the file AAD prefix.
    Builder* aad_prefix(std::string aad_prefix);

    /// Skip storing the AAD prefix in the file.
    /// Unless this is called, an AAD prefix that has been set is stored.
    Builder* disable_aad_prefix_storage();

    /// Set the encrypted columns and their properties (keys etc).
    /// Unless this is called, all columns are encrypted with the footer key.
    /// When called, file columns missing from the list are left unencrypted.
    Builder* encrypted_columns(ColumnPathToEncryptionPropertiesMap encrypted_columns);

    /// Create the properties object from the builder's current state.
    std::shared_ptr<FileEncryptionProperties> build() {
      // The FileEncryptionProperties constructor is private, so the object is
      // created with `new` and handed straight to the shared_ptr.
      std::shared_ptr<FileEncryptionProperties> properties(new FileEncryptionProperties(
          parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_,
          aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_));
      return properties;
    }

   private:
    ParquetCipher::type parquet_cipher_;
    bool encrypted_footer_;
    ::arrow::util::SecureString footer_key_;
    std::string footer_key_metadata_;
    std::string aad_prefix_;
    bool store_aad_prefix_in_file_;
    ColumnPathToEncryptionPropertiesMap encrypted_columns_;
  };

  /// Whether the footer is encrypted.
  bool encrypted_footer() const { return encrypted_footer_; }

  /// Encryption algorithm description, returned by value.
  EncryptionAlgorithm algorithm() const { return algorithm_; }

  /// Footer key.
  const ::arrow::util::SecureString& footer_key() const { return footer_key_; }

  /// Footer key retrieval metadata.
  const std::string& footer_key_metadata() const { return footer_key_metadata_; }

  /// File AAD.
  const std::string& file_aad() const { return file_aad_; }

  /// Encryption properties for the given column path (declared here, defined
  /// elsewhere).
  std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
      const std::string& column_path);

  /// Map of encrypted columns to their properties.
  const ColumnPathToEncryptionPropertiesMap& encrypted_columns() const {
    return encrypted_columns_;
  }

 private:
  EncryptionAlgorithm algorithm_;
  ::arrow::util::SecureString footer_key_;
  std::string footer_key_metadata_;
  bool encrypted_footer_;
  std::string file_aad_;
  std::string aad_prefix_;
  bool store_aad_prefix_in_file_;
  ColumnPathToEncryptionPropertiesMap encrypted_columns_;

  /// Private: instances are created through Builder::build().
  FileEncryptionProperties(ParquetCipher::type cipher,
                           ::arrow::util::SecureString footer_key,
                           std::string footer_key_metadata, bool encrypted_footer,
                           std::string aad_prefix, bool store_aad_prefix_in_file,
                           ColumnPathToEncryptionPropertiesMap encrypted_columns);
};
} // namespace parquet

View File

@@ -0,0 +1,57 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <set>
#include <string>
#include <unordered_map>
#include "arrow/filesystem/filesystem.h"
#include "parquet/platform.h"
namespace parquet::encryption {
/// Interface for keeping encryption key material outside the Parquet file, for
/// example in a separate small file in the same folder. This matters for "key
/// rotation", when MEKs have to be changed (if compromised; or periodically,
/// just in case) without modifying the Parquet files, which are often immutable.
class PARQUET_EXPORT FileKeyMaterialStore {
 public:
  /// Add key material for one encryption key.
  virtual void AddKeyMaterial(std::string key_id_in_file, std::string key_material) = 0;

  /// Get the key material for the given key ID.
  virtual std::string GetKeyMaterial(std::string key_id_in_file) = 0;

  /// Save the material in the persistent store; to be called after key
  /// material was added for all keys in the given Parquet file.
  virtual void SaveMaterial() = 0;

  /// Remove key material from the persistent store. Used in key rotation.
  virtual void RemoveMaterial() = 0;

  /// Move key material to another store. Used in key rotation.
  virtual void MoveMaterialTo(std::shared_ptr<FileKeyMaterialStore> target_key_store) = 0;

  /// Return all key IDs in this store (for the given Parquet file).
  virtual std::vector<std::string> GetKeyIDSet() = 0;

  virtual ~FileKeyMaterialStore() = default;
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,96 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/util/concurrent_map.h"
#include "arrow/util/secure_string.h"
#include "parquet/encryption/encryption.h"
#include "parquet/encryption/file_system_key_material_store.h"
#include "parquet/encryption/key_material.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/key_toolkit_internal.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"
namespace parquet::encryption {
// This class will retrieve the key from "key metadata", following these steps:
// 1. Parse "key metadata" (see structure in KeyMetadata class).
// 2. Retrieve "key material" which can be stored inside or outside "key metadata".
// 3. Unwrap the "data encryption key" from "key material". There are 2 modes:
// 3.1. single wrapping: decrypt the wrapped "data encryption key" directly with
//      "master encryption key"
// 3.2. double wrapping: 2 steps:
// 3.2.1. "key encryption key" is decrypted with "master encryption key"
// 3.2.2. "data encryption key" is decrypted with the above "key encryption key"
class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever {
public:
/// key_toolkit and kms_connection_config are used to get a KmsClient from the cache,
/// or to create one if it's not in the cache yet. cache_lifetime_seconds is the
/// lifetime of the KmsClient in the cache.
/// If the file uses external key material then the Parquet file path and file
/// system must be specified.
FileKeyUnwrapper(std::shared_ptr<KeyToolkit> key_toolkit,
const KmsConnectionConfig& kms_connection_config,
double cache_lifetime_seconds, const std::string& file_path = "",
const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);
/// Constructor overload that takes a raw pointer to the KeyToolkit
FileKeyUnwrapper(KeyToolkit* key_toolkit,
const KmsConnectionConfig& kms_connection_config,
double cache_lifetime_seconds, const std::string& file_path = "",
const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);
/// Constructor overload that takes a raw pointer to the KeyToolkit and
/// accepts an existing key_material_store rather than using
/// the file path and file system to create one when needed.
FileKeyUnwrapper(KeyToolkit* key_toolkit,
const KmsConnectionConfig& kms_connection_config,
double cache_lifetime_seconds,
std::shared_ptr<FileKeyMaterialStore> key_material_store);
/// Get the data key from key metadata
::arrow::util::SecureString GetKey(const std::string& key_metadata_bytes) override;
/// Get the data key along with the master key id from key material
KeyWithMasterId GetDataEncryptionKey(const KeyMaterial& key_material);
private:
/// Constructor taking every member explicitly.
/// NOTE(review): presumably the public constructors delegate to this one -
/// confirm against the implementation.
FileKeyUnwrapper(std::shared_ptr<KeyToolkit> key_toolkit_owner, KeyToolkit* key_toolkit,
const KmsConnectionConfig& kms_connection_config,
double cache_lifetime_seconds,
std::shared_ptr<FileKeyMaterialStore> key_material_store,
const std::string& file_path,
const std::shared_ptr<::arrow::fs::FileSystem>& file_system);
/// Declared here, defined elsewhere; per its name, obtains a KmsClient using
/// either the connection config or the given key material.
std::shared_ptr<KmsClient> GetKmsClientFromConfigOrKeyMaterial(
const KeyMaterial& key_material);
/// A map of Key Encryption Key (KEK) ID -> KEK bytes, for the current token
std::shared_ptr<::arrow::util::ConcurrentMap<std::string, ::arrow::util::SecureString>>
kek_per_kek_id_;
/// NOTE(review): presumably keeps the toolkit alive when the shared_ptr
/// constructor was used, with key_toolkit_ being the pointer actually
/// dereferenced - confirm against the implementation.
std::shared_ptr<KeyToolkit> key_toolkit_owner_;
KeyToolkit* key_toolkit_;
KmsConnectionConfig kms_connection_config_;
const double cache_entry_lifetime_seconds_;
std::shared_ptr<FileKeyMaterialStore> key_material_store_;
const std::string file_path_;
std::shared_ptr<::arrow::fs::FileSystem> file_system_;
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,84 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "arrow/util/concurrent_map.h"
#include "parquet/encryption/file_key_material_store.h"
#include "parquet/encryption/key_encryption_key.h"
#include "parquet/encryption/key_toolkit.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"
namespace parquet::encryption {
// This class will generate "key metadata" from "data encryption key" and "master key",
// following these steps:
// 1. Wrap "data encryption key". There are 2 modes:
// 1.1. single wrapping: encrypt "data encryption key" directly with "master encryption
// key"
// 1.2. double wrapping: 2 steps:
// 1.2.1. "key encryption key" is randomized (see KeyEncryptionKey class)
// 1.2.2. "data encryption key" is encrypted with the above "key encryption key"
// 2. Create "key material" (see structure in KeyMaterial class)
// 3. Create "key metadata" with "key material" inside or a reference to outside "key
// material" (see structure in KeyMetadata class).
class PARQUET_EXPORT FileKeyWrapper {
public:
/// NOTE(review): presumably lengths in bytes of a generated key encryption key
/// and of its identifier - confirm against the implementation.
static constexpr int kKeyEncryptionKeyLength = 16;
static constexpr int kKeyEncryptionKeyIdLength = 16;
/// key_toolkit and kms_connection_config are used to get a KmsClient from the cache,
/// or to create one if it's not in the cache yet. cache_entry_lifetime_seconds is the
/// lifetime of the KmsClient in the cache. key_material_store is used to store "key
/// material" outside the parquet file; NULL if "key material" is stored inside the
/// parquet file.
FileKeyWrapper(KeyToolkit* key_toolkit,
const KmsConnectionConfig& kms_connection_config,
std::shared_ptr<FileKeyMaterialStore> key_material_store,
double cache_entry_lifetime_seconds, bool double_wrapping);
/// Creates key_metadata field for a given data key, via wrapping the key with the
/// master key.
/// When external key material is used, an identifier is usually generated automatically
/// but may be specified explicitly to support key rotation,
/// which requires keeping the same identifiers.
std::string GetEncryptionKeyMetadata(const ::arrow::util::SecureString& data_key,
const std::string& master_key_id,
bool is_footer_key,
std::string key_id_in_file = "");
private:
/// Declared here, defined elsewhere; creates a KeyEncryptionKey for the given
/// master key ID.
KeyEncryptionKey CreateKeyEncryptionKey(const std::string& master_key_id);
/// A map of Master Encryption Key ID -> KeyEncryptionKey, for the current token
std::shared_ptr<::arrow::util::ConcurrentMap<std::string, KeyEncryptionKey>>
kek_per_master_key_id_;
std::shared_ptr<KmsClient> kms_client_;
KmsConnectionConfig kms_connection_config_;
std::shared_ptr<FileKeyMaterialStore> key_material_store_;
const double cache_entry_lifetime_seconds_;
const bool double_wrapping_;
/// No in-class initializer; the constructor (defined elsewhere) is expected
/// to set it.
/// NOTE(review): presumably used to generate key IDs for external key
/// material - confirm against the implementation.
uint16_t key_counter_;
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,89 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <set>
#include <string>
#include <unordered_map>
#include "arrow/filesystem/filesystem.h"
#include "parquet/encryption/file_key_material_store.h"
#include "parquet/exception.h"
namespace parquet::encryption {
/// A FileKeyMaterialStore that stores key material in a file system file in the same
/// folder as the Parquet file.
class PARQUET_EXPORT FileSystemKeyMaterialStore : public FileKeyMaterialStore {
 public:
  // Building blocks of the key material file name.
  static constexpr const char kKeyMaterialFilePrefix[] = "_KEY_MATERIAL_FOR_";
  static constexpr const char kTempFilePrefix[] = "_TMP";
  static constexpr const char kKeyMaterialFileSuffix[] = ".json";

  FileSystemKeyMaterialStore() {}

  /// Creates a store backed by the file at key_material_file_path on file_system.
  FileSystemKeyMaterialStore(std::string key_material_file_path,
                             std::shared_ptr<::arrow::fs::FileSystem> file_system);

  /// Creates a new file system key material store for a parquet file.
  /// When use_tmp_prefix is true, files are saved with an extra _TMP prefix so they don't
  /// conflict with existing external material files. This is useful during key rotation
  /// so that temporary key material files can be created while using the existing key
  /// material, before moving the key material to the non-temporary location.
  static std::shared_ptr<FileSystemKeyMaterialStore> Make(
      std::string parquet_file_path, std::shared_ptr<::arrow::fs::FileSystem> file_system,
      bool use_tmp_prefix);

  /// Add key material for one encryption key.
  ///
  /// Uses emplace, so material already registered under key_id_in_file is kept and
  /// the new value is ignored.
  void AddKeyMaterial(std::string key_id_in_file, std::string key_material) {
    key_material_map_.emplace(std::move(key_id_in_file), std::move(key_material));
  }

  /// Get key material for one encryption key.
  ///
  /// If the in-memory map is empty, it is first populated via LoadKeyMaterialMap().
  ///
  /// \throws ParquetException if no key material is stored under key_id_in_file.
  std::string GetKeyMaterial(std::string key_id_in_file) {
    if (key_material_map_.empty()) {
      LoadKeyMaterialMap();
    }
    const auto found = key_material_map_.find(key_id_in_file);
    // find() returns end() for an unknown ID; dereferencing it would be undefined
    // behaviour, so report the missing key explicitly instead.
    if (found == key_material_map_.end()) {
      throw ParquetException("Key material not found for key ID: " + key_id_in_file);
    }
    return found->second;
  }

  /// After key material was added for all keys in the given Parquet file,
  /// save material in persistent store.
  void SaveMaterial();

  /// Remove key material from persistent store. Used in key rotation.
  void RemoveMaterial();

  /// Move key material to another store. Used in key rotation.
  void MoveMaterialTo(std::shared_ptr<FileKeyMaterialStore> target_key_store);

  /// Returns all key IDs in this store (for the given Parquet file). Despite the
  /// name, the result is a std::vector rather than a set.
  std::vector<std::string> GetKeyIDSet();

 private:
  /// Path of the file that backs this store.
  std::string GetStorageFilePath() { return key_material_file_path_; }

  /// Serializes key_material_map_ to a JSON string (declared here, defined elsewhere).
  std::string BuildKeyMaterialMapJson();

  /// Populates key_material_map_ (declared here, defined elsewhere).
  void LoadKeyMaterialMap();

  std::string key_material_file_path_;
  std::shared_ptr<::arrow::fs::FileSystem> file_system_;
  /// Maps ID of a key in Parquet file and key material
  std::unordered_map<std::string, std::string> key_material_map_;
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,58 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <vector>
#include "arrow/util/base64.h"
#include "arrow/util/secure_string.h"
namespace parquet::encryption {
// In the double wrapping mode, each "data encryption key" (DEK) is encrypted with a
// "key encryption key" (KEK), which in turn is encrypted with a "master encryption
// key" (MEK). In a writer process, a random KEK is generated for each MEK ID and cached
// in a <MEK-ID : KEK> map. This makes it possible to interact with a KMS server only
// once for each MEK, in order to wrap its KEK. "Data encryption key" (DEK) wrapping is
// performed locally, and does not involve an interaction with a KMS server.
class KeyEncryptionKey {
  // Data members come first; their declaration order is also their initialization
  // order, and encoded_kek_id_ relies on kek_id_ being initialized before it.
  ::arrow::util::SecureString kek_bytes_;
  std::string kek_id_;
  std::string encoded_kek_id_;
  std::string encoded_wrapped_kek_;

 public:
  /// Takes ownership of the KEK bytes, the KEK identifier and the already encoded
  /// wrapped KEK. The base64 form of the identifier is computed here from the stored
  /// kek_id_ member (not from the moved-from parameter).
  KeyEncryptionKey(::arrow::util::SecureString kek_bytes, std::string kek_id,
                   std::string encoded_wrapped_kek)
      : kek_bytes_(std::move(kek_bytes)),
        kek_id_(std::move(kek_id)),
        encoded_kek_id_(::arrow::util::base64_encode(kek_id_)),
        encoded_wrapped_kek_(std::move(encoded_wrapped_kek)) {}

  /// The key encryption key bytes.
  const ::arrow::util::SecureString& kek_bytes() const { return kek_bytes_; }

  /// Identifier of the KEK, as passed to the constructor.
  const std::string& kek_id() const { return kek_id_; }

  /// Base64 encoding of kek_id().
  const std::string& encoded_kek_id() const { return encoded_kek_id_; }

  /// The wrapped KEK in its encoded form, as passed to the constructor.
  const std::string& encoded_wrapped_kek() const { return encoded_wrapped_kek_; }
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,129 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
#include "parquet/platform.h"
namespace arrow {
namespace json {
namespace internal {
class ObjectParser;
} // namespace internal
} // namespace json
} // namespace arrow
namespace parquet::encryption {
// KeyMaterial class represents the "key material", keeping the information that allows
// readers to recover an encryption key (see description of the KeyMetadata class). The
// keytools package (PARQUET-1373) implements the "envelope encryption" pattern, in a
// "single wrapping" or "double wrapping" mode. In the single wrapping mode, the key
// material is generated by encrypting the "data encryption key" (DEK) by a "master key".
// In the double wrapping mode, the key material is generated by encrypting the DEK by a
// "key encryption key" (KEK), that in turn is encrypted by a "master key".
//
// Key material is kept in a flat json object, with the following fields:
// 1. "keyMaterialType" - a String, with the type of key material. In the current
// version, only one value is allowed - "PKMT1" (stands
// for "parquet key management tools, version 1"). For external key material storage,
// this field is written in both "key metadata" and "key material" jsons. For internal
// key material storage, this field is written only once in the common json.
// 2. "isFooterKey" - a boolean. If true, means that the material belongs to a file footer
// key, and keeps additional information (such as
// KMS instance ID and URL). If false, means that the material belongs to a column
// key.
// 3. "kmsInstanceID" - a String, with the KMS Instance ID. Written only in footer key
// material.
// 4. "kmsInstanceURL" - a String, with the KMS Instance URL. Written only in footer key
// material.
// 5. "masterKeyID" - a String, with the ID of the master key used to generate the
// material.
// 6. "wrappedDEK" - a String, with the wrapped DEK (base64 encoding).
// 7. "doubleWrapping" - a boolean. If true, means that the material was generated in
// double wrapping mode.
// If false - in single wrapping mode.
// 8. "keyEncryptionKeyID" - a String, with the ID of the KEK used to generate the
// material. Written only in double wrapping mode.
// 9. "wrappedKEK" - a String, with the wrapped KEK (base64 encoding). Written only in
// double wrapping mode.
class PARQUET_EXPORT KeyMaterial {
 public:
  // these fields are defined in a specification and should never be changed
  static constexpr const char kKeyMaterialTypeField[] = "keyMaterialType";
  static constexpr const char kKeyMaterialType1[] = "PKMT1";

  static constexpr const char kFooterKeyIdInFile[] = "footerKey";
  static constexpr const char kColumnKeyIdInFilePrefix[] = "columnKey";

  static constexpr const char kIsFooterKeyField[] = "isFooterKey";
  static constexpr const char kDoubleWrappingField[] = "doubleWrapping";
  static constexpr const char kKmsInstanceIdField[] = "kmsInstanceID";
  static constexpr const char kKmsInstanceUrlField[] = "kmsInstanceURL";
  static constexpr const char kMasterKeyIdField[] = "masterKeyID";
  static constexpr const char kWrappedDataEncryptionKeyField[] = "wrappedDEK";
  static constexpr const char kKeyEncryptionKeyIdField[] = "keyEncryptionKeyID";
  static constexpr const char kWrappedKeyEncryptionKeyField[] = "wrappedKEK";

 public:
  /// Creates an empty key material object: all strings are empty and both flags are
  /// false (see the default member initializers below).
  KeyMaterial() = default;

  /// Parses key material from its serialized json string.
  static KeyMaterial Parse(const std::string& key_material_string);

  /// Parses key material from an already parsed json object.
  static KeyMaterial Parse(
      const ::arrow::json::internal::ObjectParser* key_material_json);

  /// This method returns a json string that will be stored either inside a parquet file
  /// or in a key material store outside the parquet file.
  static std::string SerializeToJson(bool is_footer_key,
                                     const std::string& kms_instance_id,
                                     const std::string& kms_instance_url,
                                     const std::string& master_key_id,
                                     bool is_double_wrapped, const std::string& kek_id,
                                     const std::string& encoded_wrapped_kek,
                                     const std::string& encoded_wrapped_dek,
                                     bool is_internal_storage);

  /// True if the material belongs to a file footer key, false for a column key.
  bool is_footer_key() const { return is_footer_key_; }
  /// True if the material was generated in double wrapping mode.
  bool is_double_wrapped() const { return is_double_wrapped_; }
  /// ID of the master key used to generate the material.
  const std::string& master_key_id() const { return master_key_id_; }
  /// The wrapped DEK, in its encoded form.
  const std::string& wrapped_dek() const { return encoded_wrapped_dek_; }
  /// ID of the KEK used to generate the material.
  const std::string& kek_id() const { return kek_id_; }
  /// The wrapped KEK, in its encoded form.
  const std::string& wrapped_kek() const { return encoded_wrapped_kek_; }
  /// KMS instance ID.
  const std::string& kms_instance_id() const { return kms_instance_id_; }
  /// KMS instance URL.
  const std::string& kms_instance_url() const { return kms_instance_url_; }

 private:
  KeyMaterial(bool is_footer_key, const std::string& kms_instance_id,
              const std::string& kms_instance_url, const std::string& master_key_id,
              bool is_double_wrapped, const std::string& kek_id,
              const std::string& encoded_wrapped_kek,
              const std::string& encoded_wrapped_dek);

  // The flags carry default member initializers so that a default-constructed
  // KeyMaterial never exposes an indeterminate bool through its accessors.
  bool is_footer_key_ = false;
  std::string kms_instance_id_;
  std::string kms_instance_url_;
  std::string master_key_id_;
  bool is_double_wrapped_ = false;
  std::string kek_id_;
  std::string encoded_wrapped_kek_;
  std::string encoded_wrapped_dek_;
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,91 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <string>
#include <variant>
#include "parquet/encryption/key_material.h"
#include "parquet/exception.h"
#include "parquet/platform.h"
namespace parquet::encryption {
// Parquet encryption specification defines "key metadata" as an arbitrary byte array,
// generated by file writers for each encryption key, and passed to the low level API for
// storage in the file footer. The "key metadata" field is made available to file readers
// to enable recovery of the key. This interface can be utilized for implementation
// of any key management scheme.
//
// The keytools package (PARQUET-1373) implements one approach, of many possible, to key
// management and to generation of the "key metadata" fields. This approach, based on the
// "envelope encryption" pattern, allows integration with KMS servers. It keeps the actual
// material, required to recover a key, in a "key material" object (see the KeyMaterial
// class for details). This class is implemented to support version 1 of the parquet key
// management tools specification.
//
// KeyMetadata writes (and reads) the "key metadata" field as a flat json object,
// with the following fields:
// 1. "keyMaterialType" - a String, with the type of key material.
// 2. "internalStorage" - a boolean. If true, means that "key material" is kept inside the
// "key metadata" field. If false, "key material" is kept externally (outside Parquet
// files) - in this case, "key metadata" keeps a reference to the external "key material".
// 3. "keyReference" - a String, with the reference to the external "key material".
// Written only if internalStorage is false.
//
// If internalStorage is true, "key material" is a part of "key metadata", and the json
// keeps additional fields, described in the KeyMaterial class.
class PARQUET_EXPORT KeyMetadata {
 public:
  static constexpr const char kKeyMaterialInternalStorageField[] = "internalStorage";
  static constexpr const char kKeyReferenceField[] = "keyReference";

  /// key_metadata_bytes is the key metadata field stored in the parquet file,
  /// in the serialized json object format.
  static KeyMetadata Parse(const std::string& key_metadata_bytes);

  /// Serializes key metadata that points at externally stored key material through
  /// key_reference.
  static std::string CreateSerializedForExternalMaterial(
      const std::string& key_reference);

  /// Whether the key material is held inside this key metadata.
  bool key_material_stored_internally() const { return is_internal_storage_; }

  /// The internally stored key material.
  /// \throws ParquetException if the key material is stored externally.
  const KeyMaterial& key_material() const {
    if (is_internal_storage_) {
      return ::std::get<KeyMaterial>(key_material_or_reference_);
    }
    throw ParquetException("key material is stored externally.");
  }

  /// The reference to the externally stored key material.
  /// \throws ParquetException if the key material is stored internally.
  const std::string& key_reference() const {
    if (!is_internal_storage_) {
      return ::std::get<std::string>(key_material_or_reference_);
    }
    throw ParquetException("key material is stored internally.");
  }

 private:
  explicit KeyMetadata(const KeyMaterial& key_material);
  explicit KeyMetadata(const std::string& key_reference);

  bool is_internal_storage_;
  /// Holds a KeyMaterial when is_internal_storage_ is true; otherwise holds the string
  /// that references the outside "key material".
  ::std::variant<KeyMaterial, std::string> key_material_or_reference_;
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,106 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include "parquet/encryption/key_encryption_key.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/encryption/kms_client_factory.h"
#include "parquet/encryption/two_level_cache_with_expiration.h"
#include "parquet/platform.h"
namespace parquet::encryption {
/// Period between cache clean-ups done for key rotation. The value is 60 * 60 = 3600,
/// which the trailing comment equates to one hour (so presumably seconds -- confirm
/// against the code that consumes this constant).
static constexpr uint64_t kCacheCleanPeriodForKeyRotation = 60 * 60; // 1 hour
// KeyToolkit is a utility that keeps various tools for key management (such as key
// rotation, kms client instantiation, cache control, etc), plus a number of auxiliary
// classes for internal use.
class PARQUET_EXPORT KeyToolkit {
 public:
  /// Resets the "last cache clean for key rotation" timestamp to a value-initialized
  /// time point.
  KeyToolkit() { last_cache_clean_for_key_rotation_time_ = internal::TimePoint{}; }

  /// KMS client two level cache: token -> KMSInstanceId -> KmsClient
  TwoLevelCacheWithExpiration<std::shared_ptr<KmsClient>>& kms_client_cache_per_token() {
    return kms_client_cache_;
  }

  /// Key encryption key two level cache for wrapping: token -> MasterEncryptionKeyId ->
  /// KeyEncryptionKey
  TwoLevelCacheWithExpiration<KeyEncryptionKey>& kek_write_cache_per_token() {
    return key_encryption_key_write_cache_;
  }

  /// Key encryption key two level cache for unwrapping: token -> KeyEncryptionKeyId ->
  /// KeyEncryptionKeyBytes
  TwoLevelCacheWithExpiration<::arrow::util::SecureString>& kek_read_cache_per_token() {
    return key_encryption_key_read_cache_;
  }

  /// Returns the KMS client for kms_connection_config.
  /// NOTE(review): the lifetime parameter is named "_ms" here while the other
  /// lifetimes in this class are in seconds -- confirm the unit against the definition.
  std::shared_ptr<KmsClient> GetKmsClient(
      const KmsConnectionConfig& kms_connection_config, double cache_entry_lifetime_ms);

  /// Flush any caches that are tied to the (compromised) access_token
  void RemoveCacheEntriesForToken(const std::string& access_token);

  /// Same as RemoveCacheEntriesForToken, but for every token.
  void RemoveCacheEntriesForAllTokens();

  /// Installs the factory for KMS clients. Only one registration is accepted.
  /// \throws ParquetException if a factory has already been registered.
  void RegisterKmsClientFactory(std::shared_ptr<KmsClientFactory> kms_client_factory) {
    if (kms_client_factory_ == NULLPTR) {
      kms_client_factory_ = std::move(kms_client_factory);
      return;
    }
    throw ParquetException("KMS client factory has already been registered.");
  }

  /// Key rotation. In the single wrapping mode, decrypts data keys with old master keys,
  /// then encrypts them with new master keys. In the double wrapping mode, decrypts KEKs
  /// (key encryption keys) with old master keys, generates new KEKs and encrypts them
  /// with new master keys. Works only if key material is not stored internally in file
  /// footers. Not supported in local key wrapping mode. Method can be run by multiple
  /// threads, but each thread must work on different files.
  void RotateMasterKeys(const KmsConnectionConfig& kms_connection_config,
                        const std::string& parquet_file_path,
                        const std::shared_ptr<::arrow::fs::FileSystem>& file_system,
                        bool double_wrapping, double cache_lifetime_seconds);

 private:
  TwoLevelCacheWithExpiration<std::shared_ptr<KmsClient>> kms_client_cache_;
  TwoLevelCacheWithExpiration<KeyEncryptionKey> key_encryption_key_write_cache_;
  TwoLevelCacheWithExpiration<::arrow::util::SecureString> key_encryption_key_read_cache_;
  std::shared_ptr<KmsClientFactory> kms_client_factory_;
  // Mutex paired by name with the timestamp below.
  mutable ::arrow::util::Mutex last_cache_clean_for_key_rotation_time_mutex_;
  internal::TimePoint last_cache_clean_for_key_rotation_time_;
};
// "data encryption key" and "master key identifier" are paired together as output when
// parsing from "key material"
class PARQUET_EXPORT KeyWithMasterId {
public:
KeyWithMasterId(::arrow::util::SecureString key_bytes, std::string master_id)
: key_bytes_(std::move(key_bytes)), master_id_(std::move(master_id)) {}
const ::arrow::util::SecureString& data_key() const { return key_bytes_; }
const std::string& master_id() const { return master_id_; }
private:
::arrow::util::SecureString key_bytes_;
std::string master_id_;
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,97 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "arrow/util/mutex.h"
#include "arrow/util/secure_string.h"
#include "parquet/exception.h"
#include "parquet/platform.h"
namespace parquet::encryption {
/// This class wraps the key access token of a KMS server. If your token changes over
/// time, you should keep the reference to the KeyAccessToken object and call Refresh()
/// method every time you have a new token.
class PARQUET_EXPORT KeyAccessToken {
 public:
  KeyAccessToken() = default;

  /// Creates a token holding `value`.
  ///
  /// The argument is taken by (non-const) value and moved into place, so passing an
  /// rvalue costs no copy and passing an lvalue costs exactly one.
  explicit KeyAccessToken(std::string value) : value_(std::move(value)) {}

  /// Replaces the stored token with new_value while holding the mutex.
  void Refresh(const std::string& new_value) {
    auto lock = mutex_.Lock();
    value_ = new_value;
  }

  /// Returns a reference to the stored token.
  ///
  /// NOTE(review): the lock is a local and is released when this function returns,
  /// while the returned reference stays alive. Reading through it concurrently with
  /// Refresh() is therefore not protected by the mutex; callers that need a stable
  /// value under concurrent refreshes should copy it and synchronize externally.
  const std::string& value() const {
    auto lock = mutex_.Lock();
    return value_;
  }

 private:
  std::string value_;
  mutable ::arrow::util::Mutex mutex_;
};
struct PARQUET_EXPORT KmsConnectionConfig {
  std::string kms_instance_id;
  std::string kms_instance_url;
  /// If the access token is changed in the future, you should keep a reference to
  /// this object and call Refresh() on it whenever there is a new access token.
  std::shared_ptr<KeyAccessToken> refreshable_key_access_token;
  /// Additional KMS configuration as string key/value pairs.
  std::unordered_map<std::string, std::string> custom_kms_conf;

  KmsConnectionConfig();

  /// Returns the current value of the access token.
  /// \throws ParquetException if no token object is set or its value is empty.
  const std::string& key_access_token() const {
    const bool has_token = refreshable_key_access_token != NULLPTR &&
                           !refreshable_key_access_token->value().empty();
    if (has_token) {
      return refreshable_key_access_token->value();
    }
    throw ParquetException("key access token is not set!");
  }

  /// Declared here only; judging by the name it fills in defaults for empty fields.
  void SetDefaultIfEmpty();
};
class PARQUET_EXPORT KmsClient {
 public:
  // Default values for the KMS instance ID, the KMS instance URL and the key access
  // token.
  static constexpr const char kKmsInstanceIdDefault[] = "DEFAULT";
  static constexpr const char kKmsInstanceUrlDefault[] = "DEFAULT";
  static constexpr const char kKeyAccessTokenDefault[] = "DEFAULT";

  /// \brief Wraps a key.
  ///
  /// Encrypts key_bytes with the master key identified by master_key_identifier,
  /// encodes the result and potentially adds KMS-specific metadata.
  virtual std::string WrapKey(const ::arrow::util::SecureString& key_bytes,
                              const std::string& master_key_identifier) = 0;

  /// \brief Decrypts (unwraps) wrapped_key with the master key identified by
  /// master_key_identifier.
  virtual ::arrow::util::SecureString UnwrapKey(
      const std::string& wrapped_key, const std::string& master_key_identifier) = 0;

  // Virtual destructor so implementations can be destroyed through a KmsClient pointer.
  virtual ~KmsClient() = default;
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,38 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"
namespace parquet::encryption {
class PARQUET_EXPORT KmsClientFactory {
public:
explicit KmsClientFactory(bool wrap_locally = false) : wrap_locally_(wrap_locally) {}
virtual ~KmsClientFactory() = default;
virtual std::shared_ptr<KmsClient> CreateKmsClient(
const KmsConnectionConfig& kms_connection_config) = 0;
protected:
bool wrap_locally_;
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,95 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <unordered_map>
#include <vector>
#include "arrow/util/concurrent_map.h"
#include "parquet/encryption/kms_client.h"
#include "parquet/platform.h"
namespace parquet::encryption {
/// This class supports local wrapping mode, master keys will be fetched from the KMS
/// server and used to encrypt other keys (data encryption keys or key encryption keys).
class PARQUET_EXPORT LocalWrapKmsClient : public KmsClient {
public:
/// Placeholder used as the master key version (see the LocalKeyWrap comment below).
static constexpr const char kLocalWrapNoKeyVersion[] = "NO_VERSION";
/// Creates a client for the given KMS connection configuration.
explicit LocalWrapKmsClient(const KmsConnectionConfig& kms_connection_config);
/// Wraps key_bytes with the master key identified by master_key_identifier.
std::string WrapKey(const ::arrow::util::SecureString& key_bytes,
const std::string& master_key_identifier) override;
/// Unwraps wrapped_key with the master key identified by master_key_identifier.
::arrow::util::SecureString UnwrapKey(
const std::string& wrapped_key, const std::string& master_key_identifier) override;
protected:
/// Get master key from the remote KMS server.
/// Note: this function might be called by multiple threads
virtual const ::arrow::util::SecureString& GetMasterKeyFromServer(
const std::string& master_key_identifier) = 0;
private:
/// KMS systems wrap keys by encrypting them by master keys, and attaching additional
/// information (such as the version number of the master key) to the result of
/// encryption. The master key version is required in key rotation. Currently, the
/// local wrapping mode does not support key rotation (because not all KMS systems allow
/// to fetch a master key by its ID and version number). Still, the local wrapping mode
/// adds a placeholder for the master key version, that will enable support for key
/// rotation in this mode in the future, with appropriate KMS systems. This will also
/// enable backward compatibility, where future readers will be able to extract master
/// key version in the files written by the current code.
///
/// LocalKeyWrap class writes (and reads) the "key wrap" as a flat json with the
/// following fields:
/// 1. "masterKeyVersion" - a String, with the master key version. In the current
/// version, only one value is allowed - "NO_VERSION".
/// 2. "encryptedKey" - a String, with the key encrypted by the master key
/// (base64-encoded).
class LocalKeyWrap {
public:
static constexpr const char kLocalWrapKeyVersionField[] = "masterKeyVersion";
static constexpr const char kLocalWrapEncryptedKeyField[] = "encryptedKey";
LocalKeyWrap(std::string master_key_version, std::string encrypted_encoded_key);
/// Serializes a key wrap for encrypted_encoded_key.
static std::string CreateSerialized(const std::string& encrypted_encoded_key);
/// Parses a serialized key wrap.
static LocalKeyWrap Parse(const std::string& wrapped_key);
const std::string& master_key_version() const { return master_key_version_; }
const std::string& encrypted_encoded_key() const { return encrypted_encoded_key_; }
private:
// Members are initialized in declaration order (encrypted_encoded_key_ first), which
// is the reverse of the constructor's parameter order.
std::string encrypted_encoded_key_;
std::string master_key_version_;
};
/// Returns the key for key_identifier. Declared here only; presumably consults
/// master_key_cache_ before calling GetMasterKeyFromServer -- confirm in the
/// implementation.
const ::arrow::util::SecureString& GetKeyFromServer(const std::string& key_identifier);
protected:
KmsConnectionConfig kms_connection_config_;
/// Maps a key identifier to its key bytes.
::arrow::util::ConcurrentMap<std::string, ::arrow::util::SecureString>
master_key_cache_;
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,135 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This module declares shared constants and helper classes (FileEncryptor,
// FileDecryptor) used by the Parquet encryption tests to write and read encrypted
// files.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <gtest/gtest.h>
#include "arrow/filesystem/filesystem.h"
#include "arrow/filesystem/localfs.h"
#include "arrow/status.h"
#include "arrow/util/io_util.h"
#include "arrow/util/secure_string.h"
#include "parquet/encryption/encryption.h"
#include "parquet/test_util.h"
namespace parquet {
class ParquetFileReader;
namespace encryption::test {
using ::arrow::internal::TemporaryDir;
using ::arrow::util::SecureString;
// Presumably the length of the fixed-length byte array test column -- confirm against
// the schema built by the tests.
constexpr int kFixedLength = 10;
// Encryption keys used by the tests; each literal is 16 characters long.
const SecureString kFooterEncryptionKey("0123456789012345");
const SecureString kColumnEncryptionKey1("1234567890123450");
const SecureString kColumnEncryptionKey2("1234567890123451");
// File name used by the tests.
const char kFileName[] = "tester";
// Get the path of file inside parquet test data directory
// Declaration only; the definition lives in another translation unit.
std::string data_file(const char* file);
// A temporary directory that contains the encrypted files generated in the tests.
// Declared extern here, so it must be defined in another translation unit.
extern std::unique_ptr<TemporaryDir> temp_dir;
// Asks TemporaryDir::Make for a temporary directory using the fixed test prefix and
// hands its Result (directory or error) back to the caller unchanged.
inline ::arrow::Result<std::unique_ptr<TemporaryDir>> temp_data_dir() {
  const std::string prefix = "parquet-encryption-test-";
  return TemporaryDir::Make(prefix);
}
// Names of the columns in the test schema, one per physical type.
const char kDoubleFieldName[] = "double_field";
const char kFloatFieldName[] = "float_field";
const char kBooleanFieldName[] = "boolean_field";
const char kInt32FieldName[] = "int32_field";
const char kInt64FieldName[] = "int64_field";
const char kInt96FieldName[] = "int96_field";
const char kByteArrayFieldName[] = "ba_field";
const char kFixedLenByteArrayFieldName[] = "flba_field";
// Footer master key (16 characters) and its identifier.
const char kFooterMasterKey[] = "0123456789012345";
const char kFooterMasterKeyId[] = "kf";
// Six column master keys (16 characters each) and the six identifiers below; the two
// arrays are presumably matched by index -- confirm against BuildKeyMap.
const char* const kColumnMasterKeys[] = {"1234567890123450", "1234567890123451",
"1234567890123452", "1234567890123453",
"1234567890123454", "1234567890123455"};
const char* const kColumnMasterKeyIds[] = {"kc1", "kc2", "kc3", "kc4", "kc5", "kc6"};
// New master key values used to simulate key rotation
// (same count and length as the original keys above).
const char kNewFooterMasterKey[] = "9123456789012345";
const char* const kNewColumnMasterKeys[] = {"9234567890123450", "9234567890123451",
"9234567890123452", "9234567890123453",
"9234567890123454", "9234567890123455"};
// The result of this function will be used to set into TestOnlyInMemoryKmsClientFactory
// as the key mapping to look at.
// Takes the column key IDs and keys as parallel arrays plus the footer key ID and key,
// and returns a map from string to SecureString (presumably key ID -> key -- confirm
// in the definition). Declaration only.
std::unordered_map<std::string, SecureString> BuildKeyMap(const char* const* column_ids,
const char* const* column_keys,
const char* footer_id,
const char* footer_key);
// The result of this function will be used to set into EncryptionConfiguration
// as column keys.
// Declaration only; the format of the returned string is set by the definition.
std::string BuildColumnKeyMapping();
// FileEncryptor and FileDecryptor are helper classes to write/read an encrypted parquet
// file corresponding to each pair of FileEncryptionProperties/FileDecryptionProperties.
// FileEncryptor writes the file with fixed data values and FileDecryptor reads the file
// and verifies the correctness of the data values.
class FileEncryptor {
public:
FileEncryptor();
// Writes an encrypted parquet file.
// `file` is the output file to write; `encryption_configurations` holds the
// encryption properties applied while writing.
void EncryptFile(
std::string file,
std::shared_ptr<parquet::FileEncryptionProperties> encryption_configurations);
private:
// Builds the schema (group node) of the test file.
// NOTE(review): presumably its result is stored in schema_ by the constructor;
// the definition is not visible here.
std::shared_ptr<schema::GroupNode> SetupEncryptionSchema();
// Default shape of the generated file: 5 row groups of 50 rows each.
int num_rowgroups_ = 5;
int rows_per_rowgroup_ = 50;
std::shared_ptr<schema::GroupNode> schema_;
};
// Test helper that reads a parquet file using the given decryption properties.
// Per the header comment on these helper classes, it verifies the data written
// by FileEncryptor.
class FileDecryptor {
public:
// Reads `file_name` with `file_decryption_properties`.
// NOTE(review): presumably opens a reader and delegates to CheckFile(); the
// definition is not visible here.
void DecryptFile(
const std::string& file_name,
const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties);
// Reads the page index of `file_name` with `file_decryption_properties`.
// NOTE(review): presumably delegates to CheckPageIndex(); confirm in the definition.
void DecryptPageIndex(
const std::string& file_name,
const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties);
private:
// Checks the contents reachable through an already opened `file_reader`.
void CheckFile(
parquet::ParquetFileReader* file_reader,
const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties);
// Checks the page index reachable through an already opened `file_reader`.
void CheckPageIndex(
parquet::ParquetFileReader* file_reader,
const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties);
};
} // namespace encryption::test
} // namespace parquet

View File

@@ -0,0 +1,101 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <unordered_map>
#include "arrow/util/base64.h"
#include "parquet/encryption/kms_client_factory.h"
#include "parquet/encryption/local_wrap_kms_client.h"
#include "parquet/platform.h"
namespace parquet::encryption {
// This is a mock class, built for testing only. Don't use it as an example of
// LocalWrapKmsClient implementation.
//
// It derives from LocalWrapKmsClient and supplies master keys from a static,
// class-wide map instead of contacting a real KMS.
class TestOnlyLocalWrapInMemoryKms : public LocalWrapKmsClient {
public:
explicit TestOnlyLocalWrapInMemoryKms(const KmsConnectionConfig& kms_connection_config);
// Installs the master keys (key identifier -> key) shared by all instances.
// Static: it affects the class-wide map, not a single client.
static void InitializeMasterKeys(
const std::unordered_map<std::string, ::arrow::util::SecureString>&
master_keys_map);
protected:
// Returns a reference to the master key registered for `master_key_identifier`.
// NOTE(review): behavior for an unknown identifier is defined in the .cc file,
// not visible here.
const ::arrow::util::SecureString& GetMasterKeyFromServer(
const std::string& master_key_identifier) override;
private:
// Class-wide key store: master key identifier -> master key.
static std::unordered_map<std::string, ::arrow::util::SecureString> master_key_map_;
};
// This is a mock class, built for testing only. Don't use it as an example of KmsClient
// implementation.
//
// It implements the KmsClient WrapKey/UnwrapKey interface directly and keeps
// separate static key maps for wrapping and unwrapping, so tests can simulate
// master key rotation.
class TestOnlyInServerWrapKms : public KmsClient {
public:
// Installs the master keys (key identifier -> key) shared by all instances.
// NOTE(review): presumably fills both the wrapping and the unwrapping map;
// confirm in the definition.
static void InitializeMasterKeys(
const std::unordered_map<std::string, ::arrow::util::SecureString>&
master_keys_map);
// Wraps `key_bytes` with the master key named by `master_key_identifier` and
// returns the wrapped key as a string.
std::string WrapKey(const ::arrow::util::SecureString& key_bytes,
const std::string& master_key_identifier) override;
// Reverses WrapKey(): recovers the key from `wrapped_key` using the master key
// named by `master_key_identifier`.
::arrow::util::SecureString UnwrapKey(
const std::string& wrapped_key, const std::string& master_key_identifier) override;
// Key rotation hooks.
// NOTE(review): presumably StartKeyRotation() switches the wrapping keys to
// `new_master_keys_map` while the old keys stay available for unwrapping, and
// FinishKeyRotation() makes the new keys the unwrapping keys too. Confirm in
// the definition.
static void StartKeyRotation(
const std::unordered_map<std::string, ::arrow::util::SecureString>&
new_master_keys_map);
static void FinishKeyRotation();
private:
// Looks up the master key for `master_key_identifier`; returns it by value.
::arrow::util::SecureString GetMasterKeyFromServer(
const std::string& master_key_identifier);
// Different wrapping and unwrapping key maps to imitate versioning
// and support key rotation.
static std::unordered_map<std::string, ::arrow::util::SecureString>
unwrapping_master_key_map_;
static std::unordered_map<std::string, ::arrow::util::SecureString>
wrapping_master_key_map_;
};
// This is a mock class, built for testing only. Don't use it as an example of
// KmsClientFactory implementation.
class TestOnlyInMemoryKmsClientFactory : public KmsClientFactory {
public:
TestOnlyInMemoryKmsClientFactory(
bool wrap_locally,
const std::unordered_map<std::string, ::arrow::util::SecureString>& master_keys_map)
: KmsClientFactory(wrap_locally) {
TestOnlyLocalWrapInMemoryKms::InitializeMasterKeys(master_keys_map);
TestOnlyInServerWrapKms::InitializeMasterKeys(master_keys_map);
}
std::shared_ptr<KmsClient> CreateKmsClient(
const KmsConnectionConfig& kms_connection_config) {
if (wrap_locally_) {
return std::make_shared<TestOnlyLocalWrapInMemoryKms>(kms_connection_config);
} else {
return std::make_shared<TestOnlyInServerWrapKms>();
}
}
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,149 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <chrono>
#include <unordered_map>
#include "arrow/util/concurrent_map.h"
#include "arrow/util/mutex.h"
namespace parquet::encryption {
using ::arrow::util::ConcurrentMap;
namespace internal {
// Point on std::chrono::system_clock whose duration is expressed in seconds
// stored as a double.
using TimePoint =
    std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<double>>;

// Samples std::chrono::system_clock and returns the reading as a TimePoint.
inline TimePoint CurrentTimePoint() {
  const std::chrono::system_clock::time_point wall_clock_now =
      std::chrono::system_clock::now();
  return TimePoint(wall_clock_now);
}
// A cached value paired with the absolute time at which it expires.
//
// The expiration time is fixed at construction (const member), which also makes
// the entry non-assignable: an entry can be copied or replaced, not refreshed.
// A default-constructed entry carries a default-constructed timestamp (the clock
// epoch) and a default-constructed item.
template <typename E>
class ExpiringCacheEntry {
 public:
  ExpiringCacheEntry() = default;

  /// \param cached_item value to store; taken by value and moved into place.
  /// \param expiration_interval_seconds lifetime of the entry, in seconds,
  ///        counted from the moment of construction.
  ExpiringCacheEntry(E cached_item, double expiration_interval_seconds)
      : expiration_timestamp_(CurrentTimePoint() +
                              std::chrono::duration<double>(expiration_interval_seconds)),
        cached_item_(std::move(cached_item)) {}

  /// \return true once the current time is strictly past the expiration time.
  bool IsExpired() const {
    const auto now = CurrentTimePoint();
    return (now > expiration_timestamp_);
  }

  /// \return a copy of the stored item. Const-qualified because it does not
  ///         modify the entry, so it is usable through const references too.
  E cached_item() const { return cached_item_; }

 private:
  const TimePoint expiration_timestamp_;
  E cached_item_;
};
// This class is to avoid the below warning when compiling KeyToolkit class with VS2015
// warning C4503: decorated name length exceeded, name was truncated
//
// It is a thin named wrapper around
// ExpiringCacheEntry<std::shared_ptr<ConcurrentMap<std::string, V>>> and forwards
// every call to it.
template <typename V>
class ExpiringCacheMapEntry {
 public:
  ExpiringCacheMapEntry() = default;

  /// \param cached_item shared map to store; moved into the wrapped entry so no
  ///        extra shared_ptr reference-count round trip is paid.
  /// \param expiration_interval_seconds lifetime of the entry, in seconds.
  explicit ExpiringCacheMapEntry(
      std::shared_ptr<ConcurrentMap<std::string, V>> cached_item,
      double expiration_interval_seconds)
      : map_cache_(std::move(cached_item), expiration_interval_seconds) {}

  /// \return whether the wrapped entry has expired. Const-qualified, matching
  ///         the wrapped ExpiringCacheEntry::IsExpired().
  bool IsExpired() const { return map_cache_.IsExpired(); }

  /// \return the shared map held by the wrapped entry.
  std::shared_ptr<ConcurrentMap<std::string, V>> cached_item() {
    return map_cache_.cached_item();
  }

 private:
  // ConcurrentMap object may be accessed and modified at many places at the same time,
  // from multiple threads, or even removed from cache.
  ExpiringCacheEntry<std::shared_ptr<ConcurrentMap<std::string, V>>> map_cache_;
};
} // namespace internal
// Two-level cache with expiration of internal caches according to token lifetime.
// External cache is per token, internal is per string key.
// Wrapper class around:
//    std::unordered_map<std::string,
//    internal::ExpiringCacheEntry<std::unordered_map<std::string, V>>>
// This cache is safe to be shared between threads: every public method takes
// the internal mutex before touching the external map.
template <typename V>
class TwoLevelCacheWithExpiration {
 public:
  TwoLevelCacheWithExpiration() {
    last_cache_cleanup_timestamp_ = internal::CurrentTimePoint();
  }

  /// Returns the internal cache for `access_token`, creating it when there is
  /// none or when the existing one has expired.
  ///
  /// \param access_token key of the external cache.
  /// \param cache_entry_lifetime_seconds lifetime given to a newly created
  ///        internal cache; ignored when a live one already exists.
  /// \return the live internal cache for the token.
  std::shared_ptr<ConcurrentMap<std::string, V>> GetOrCreateInternalCache(
      const std::string& access_token, double cache_entry_lifetime_seconds) {
    auto lock = mutex_.Lock();

    auto external_cache_entry = cache_.find(access_token);
    if (external_cache_entry != cache_.end() &&
        external_cache_entry->second.IsExpired()) {
      // An expired entry must be erased first: insert()/emplace() leave an
      // existing key untouched, so the stale cache would otherwise be returned
      // forever. Callers still holding the old shared_ptr keep it alive.
      cache_.erase(external_cache_entry);
      external_cache_entry = cache_.end();
    }

    if (external_cache_entry == cache_.end()) {
      external_cache_entry =
          cache_
              .emplace(access_token,
                       internal::ExpiringCacheMapEntry<V>(
                           std::make_shared<ConcurrentMap<std::string, V>>(),
                           cache_entry_lifetime_seconds))
              .first;
    }

    return external_cache_entry->second.cached_item();
  }

  /// Removes all expired internal caches, at most once per cleanup period.
  ///
  /// \param cache_cleanup_period_seconds minimum time since the previous
  ///        cleanup (or since construction) before another one runs. The
  ///        default of 0.0 runs the cleanup as soon as any time has passed.
  void CheckCacheForExpiredTokens(double cache_cleanup_period_seconds = 0.0) {
    auto lock = mutex_.Lock();

    const auto now = internal::CurrentTimePoint();
    if (now > (last_cache_cleanup_timestamp_ +
               std::chrono::duration<double>(cache_cleanup_period_seconds))) {
      RemoveExpiredEntriesNoMutex();
      last_cache_cleanup_timestamp_ = now;
    }
  }

  /// Drops the internal cache of `access_token`, if any.
  void Remove(const std::string& access_token) {
    auto lock = mutex_.Lock();
    cache_.erase(access_token);
  }

  /// Drops every internal cache.
  void Clear() {
    auto lock = mutex_.Lock();
    cache_.clear();
  }

 private:
  // Erases all expired entries. The caller must already hold mutex_.
  void RemoveExpiredEntriesNoMutex() {
    for (auto it = cache_.begin(); it != cache_.end();) {
      if (it->second.IsExpired()) {
        // erase() returns the iterator following the removed element.
        it = cache_.erase(it);
      } else {
        ++it;
      }
    }
  }

  std::unordered_map<std::string, internal::ExpiringCacheMapEntry<V>> cache_;
  internal::TimePoint last_cache_cleanup_timestamp_;
  ::arrow::util::Mutex mutex_;
};
} // namespace parquet::encryption

View File

@@ -0,0 +1,28 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
// Forward declarations of the parquet encryption classes, so that other headers
// can refer to these types by pointer or reference without including their full
// definitions.
namespace parquet {
class Decryptor;
class Encryptor;
class InternalFileDecryptor;
class InternalFileEncryptor;
} // namespace parquet