Initial commit

This commit is contained in:
kdusek
2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions

View File

@@ -0,0 +1,34 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/util/config.h" // IWYU pragma: export
#include "arrow/filesystem/filesystem.h" // IWYU pragma: export
#ifdef ARROW_AZURE
# include "arrow/filesystem/azurefs.h" // IWYU pragma: export
#endif
#ifdef ARROW_GCS
# include "arrow/filesystem/gcsfs.h" // IWYU pragma: export
#endif
#include "arrow/filesystem/hdfs.h" // IWYU pragma: export
#include "arrow/filesystem/localfs.h" // IWYU pragma: export
#include "arrow/filesystem/mockfs.h" // IWYU pragma: export
#ifdef ARROW_S3
# include "arrow/filesystem/s3fs.h" // IWYU pragma: export
#endif

View File

@@ -0,0 +1,373 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "arrow/filesystem/filesystem.h"
#include "arrow/util/macros.h"
#include "arrow/util/uri.h"
// The Azure SDK types named in this header are only forward-declared here;
// no Azure SDK header is included by this file.
namespace Azure {

namespace Core::Credentials {
class TokenCredential;
}  // namespace Core::Credentials

namespace Storage {
class StorageSharedKeyCredential;

namespace Blobs {
class BlobServiceClient;
}  // namespace Blobs

namespace Files::DataLake {
class DataLakeServiceClient;
class DataLakeFileSystemClient;
}  // namespace Files::DataLake

}  // namespace Storage

}  // namespace Azure
namespace arrow::fs {
// Forward declarations of the classes named in the `friend` declarations of
// AzureFileSystem and AzureOptions in this header (presumably test fixtures,
// judging by their names).
class TestAzureFileSystem;
class TestAzureOptions;
/// Options for the AzureFileSystem implementation.
///
/// By default, authentication is handled by the Azure SDK's credential chain
/// which may read from multiple environment variables, such as:
/// - `AZURE_TENANT_ID`
/// - `AZURE_CLIENT_ID`
/// - `AZURE_CLIENT_SECRET`
/// - `AZURE_AUTHORITY_HOST`
/// - `AZURE_CLIENT_CERTIFICATE_PATH`
/// - `AZURE_FEDERATED_TOKEN_FILE`
///
/// Functions are provided for explicit configuration of credentials if that is
/// preferred (see the `Configure*Credential()` member functions).
struct ARROW_EXPORT AzureOptions {
// Gives the TestAzureOptions class access to the private members of this struct.
friend class TestAzureOptions;
/// \brief The name of the Azure Storage Account being accessed.
///
/// All service URLs will be constructed using this storage account name.
/// `ConfigureAccountKeyCredential` assumes the user wants to authenticate
/// this account.
std::string account_name;
/// \brief hostname[:port] of the Azure Blob Storage Service.
///
/// If the hostname is a relative domain name (one that starts with a '.'), then storage
/// account URLs will be constructed by prepending the account name to the hostname.
/// If the hostname is a fully qualified domain name, then the hostname will be used
/// as-is and the account name will follow the hostname in the URL path.
///
/// Default: ".blob.core.windows.net"
std::string blob_storage_authority = ".blob.core.windows.net";
/// \brief hostname[:port] of the Azure Data Lake Storage Gen 2 Service.
///
/// If the hostname is a relative domain name (one that starts with a '.'), then storage
/// account URLs will be constructed by prepending the account name to the hostname.
/// If the hostname is a fully qualified domain name, then the hostname will be used
/// as-is and the account name will follow the hostname in the URL path.
///
/// Default: ".dfs.core.windows.net"
std::string dfs_storage_authority = ".dfs.core.windows.net";
/// \brief Azure Blob Storage connection transport.
///
/// Default: "https"
std::string blob_storage_scheme = "https";
/// \brief Azure Data Lake Storage Gen 2 connection transport.
///
/// Default: "https"
std::string dfs_storage_scheme = "https";
// TODO(GH-38598): Add support for more auth methods.
// std::string connection_string;
// std::string sas_token;
/// \brief Default metadata for OpenOutputStream.
///
/// This will be ignored if non-empty metadata is passed to OpenOutputStream.
std::shared_ptr<const KeyValueMetadata> default_metadata;
/// \brief Whether OutputStream writes will be issued in the background, without
/// blocking.
///
/// Default: true
bool background_writes = true;
private:
// The credential mechanism currently selected; starts out as kDefault.
// NOTE(review): the enumerators line up with the Configure*Credential()
// functions below and are presumably set by them -- confirm in the
// implementation file.
enum class CredentialKind {
kDefault,
kAnonymous,
kStorageSharedKey,
kSASToken,
kClientSecret,
kManagedIdentity,
kCLI,
kWorkloadIdentity,
kEnvironment,
} credential_kind_ = CredentialKind::kDefault;
// Credential state. NOTE(review): presumably only the member matching
// `credential_kind_` is meaningful at any given time -- confirm.
std::shared_ptr<Azure::Storage::StorageSharedKeyCredential>
storage_shared_key_credential_;
std::string sas_token_;
// Declared `mutable`, so const member functions are able to assign it.
mutable std::shared_ptr<Azure::Core::Credentials::TokenCredential> token_credential_;
public:
// Both are defined out of line: the Azure credential types held by the
// shared_ptr members above are only forward-declared in this header.
AzureOptions();
~AzureOptions();
private:
// Helpers taking the parsed URI; judging by their names they back the two
// halves of FromUri() (scheme + hierarchical part, then the query string).
void ExtractFromUriSchemeAndHierPart(const Uri& uri, std::string* out_path);
Status ExtractFromUriQuery(const Uri& uri);
public:
/// \brief Construct a new AzureOptions from a URI.
///
/// Supported formats:
///
/// 1. abfs[s]://\<account\>.blob.core.windows.net[/\<container\>[/\<path\>]]
/// 2. abfs[s]://\<container\>\@\<account\>.dfs.core.windows.net[/path]
/// 3. abfs[s]://[\<account@]\<host[.domain]\>[\<:port\>][/\<container\>[/path]]
/// 4. abfs[s]://[\<account@]\<container\>[/path]
///
/// (1) and (2) are compatible with the Azure Data Lake Storage Gen2 URIs
/// [1], (3) is for Azure Blob Storage compatible service including Azurite,
/// and (4) is a shorter version of (1) and (2).
///
/// Note that there is no difference between abfs and abfss. HTTPS is
/// used with abfs by default. You can force the use of HTTP by specifying
/// the "enable_tls=false" query parameter.
///
/// Supported query parameters:
///
/// * blob_storage_authority: Set AzureOptions::blob_storage_authority
/// * dfs_storage_authority: Set AzureOptions::dfs_storage_authority
/// * enable_tls: If it's "false" or "0", HTTP not HTTPS is used.
/// * credential_kind: One of "default", "anonymous", "workload_identity",
/// "environment" or "cli". If "default" is specified, it's
/// just ignored. If "anonymous" is specified,
/// AzureOptions::ConfigureAnonymousCredential() is called. If
/// "workload_identity" is specified,
/// AzureOptions::ConfigureWorkloadIdentityCredential() is called. If
/// "environment" is specified,
/// AzureOptions::ConfigureEnvironmentCredential() is called. If "cli" is
/// specified, AzureOptions::ConfigureCLICredential() is called.
/// * tenant_id: You must specify "client_id" and "client_secret"
/// too. AzureOptions::ConfigureClientSecretCredential() is called.
/// * client_id: If you don't specify "tenant_id" and
/// "client_secret",
/// AzureOptions::ConfigureManagedIdentityCredential() is
/// called. If you specify "tenant_id" and "client_secret" too,
/// AzureOptions::ConfigureClientSecretCredential() is called.
/// * client_secret: You must specify "tenant_id" and "client_id"
/// too. AzureOptions::ConfigureClientSecretCredential() is called.
/// * A SAS token is made up of several query parameters. Appending a SAS
/// token to the URI configures SAS token auth by calling
/// AzureOptions::ConfigureSASCredential().
///
/// \param uri the URI to build the options from (parsed, or as a string in
/// the second overload)
/// \param out_path output parameter for a path; presumably receives the
/// container/path portion of the URI -- confirm in the implementation
/// \return the configured options, or an error
///
/// [1]:
/// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri
static Result<AzureOptions> FromUri(const Uri& uri, std::string* out_path);
static Result<AzureOptions> FromUri(const std::string& uri, std::string* out_path);
// Explicit credential configuration. Each function reports its outcome as a
// Status. NOTE(review): each presumably selects the corresponding
// CredentialKind; the exact effects live in the implementation file.
Status ConfigureDefaultCredential();
Status ConfigureAnonymousCredential();
// Per the `account_name` documentation, this assumes the key belongs to
// `account_name`.
Status ConfigureAccountKeyCredential(const std::string& account_key);
Status ConfigureSASCredential(const std::string& sas_token);
Status ConfigureClientSecretCredential(const std::string& tenant_id,
const std::string& client_id,
const std::string& client_secret);
// `client_id` is optional and defaults to an empty string.
Status ConfigureManagedIdentityCredential(const std::string& client_id = std::string());
Status ConfigureCLICredential();
Status ConfigureWorkloadIdentityCredential();
Status ConfigureEnvironmentCredential();
/// \brief Compare these options with `other`.
bool Equals(const AzureOptions& other) const;
// URL builders for the given account. NOTE(review): presumably built from
// the *_storage_scheme and *_storage_authority members as described in their
// documentation -- confirm in the implementation.
std::string AccountBlobUrl(const std::string& account_name) const;
std::string AccountDfsUrl(const std::string& account_name) const;
// Factories for the Azure SDK service clients; each returns a uniquely owned
// client or an error.
Result<std::unique_ptr<Azure::Storage::Blobs::BlobServiceClient>>
MakeBlobServiceClient() const;
Result<std::unique_ptr<Azure::Storage::Files::DataLake::DataLakeServiceClient>>
MakeDataLakeServiceClient() const;
};
/// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and
/// Azure Data Lake Storage Gen2 (ADLS Gen2) [2].
///
/// ADLS Gen2 isn't a dedicated service or account type. It's a set of capabilities that
/// support high throughput analytic workloads, built on Azure Blob Storage. All the data
/// ingested via the ADLS Gen2 APIs is persisted as blobs in the storage account.
/// ADLS Gen2 provides filesystem semantics, file-level security, and Hadoop
/// compatibility. ADLS Gen1 exists as a separate object that will be retired on
/// 2024-02-29 and new ADLS accounts use Gen2 instead.
///
/// ADLS Gen2 and Blob APIs can operate on the same data, but there are
/// some limitations [3]. The ones that are relevant to this
/// implementation are listed here:
///
/// - You can't use Blob APIs and ADLS APIs to write to the same instance of a file. If
/// you write to a file by using ADLS APIs then that file's blocks won't be visible
/// to calls to the GetBlockList Blob API. The only exception is when you're
/// overwriting.
/// - When you use the ListBlobs operation without specifying a delimiter, the results
/// include both directories and blobs. If you choose to use a delimiter, use only a
/// forward slash (/) \--- the only supported delimiter.
/// - If you use the DeleteBlob API to delete a directory, that directory is deleted only
/// if it's empty. This means that you can't use the Blob API to delete directories
/// recursively.
///
/// [1]: https://azure.microsoft.com/en-us/products/storage/blobs
/// [2]: https://azure.microsoft.com/en-us/products/storage/data-lake-storage
/// [3]:
/// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-known-issues
class ARROW_EXPORT AzureFileSystem : public FileSystem {
private:
// The implementation class is only declared here and held through a pointer,
// keeping its definition out of this header.
class Impl;
std::unique_ptr<Impl> impl_;
// Private constructor: instances are obtained through Make().
explicit AzureFileSystem(std::unique_ptr<Impl>&& impl);
friend class TestAzureFileSystem;
// NOTE(review): private and reachable only by the TestAzureFileSystem friend,
// so presumably a test-only hook; the meaning of `hns_support` is defined in
// the implementation file.
void ForceCachedHierarchicalNamespaceSupport(int hns_support);
public:
~AzureFileSystem() override = default;
/// \brief Create an AzureFileSystem from the given options.
///
/// The second parameter is the IOContext to use; it defaults to
/// io::default_io_context().
///
/// \return a shared pointer to the new filesystem, or an error
static Result<std::shared_ptr<AzureFileSystem>> Make(
const AzureOptions& options, const io::IOContext& = io::default_io_context());
/// \brief The type name of this filesystem: always "abfs".
std::string type_name() const override { return "abfs"; }
/// Return the original Azure options when constructing the filesystem
const AzureOptions& options() const;
bool Equals(const FileSystem& other) const override;
/// \cond FALSE
// Re-expose the base-class overloads that would otherwise be hidden by the
// overrides declared below.
using FileSystem::CreateDir;
using FileSystem::DeleteDirContents;
using FileSystem::GetFileInfo;
using FileSystem::OpenAppendStream;
using FileSystem::OpenOutputStream;
/// \endcond
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive) override;
/// \brief Delete a directory and its contents recursively.
///
/// Atomicity is guaranteed only on Hierarchical Namespace Storage accounts.
Status DeleteDir(const std::string& path) override;
/// \brief Non-atomically deletes the contents of a directory.
///
/// This function can return a bad Status after only partially deleting the
/// contents of the directory.
Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
/// \brief Deletion of all the containers in the storage account (not
/// implemented for safety reasons).
///
/// \return Status::NotImplemented
Status DeleteRootDirContents() override;
/// \brief Deletes a file.
///
/// Supported on both flat namespace and Hierarchical Namespace storage
/// accounts. A check is made to guarantee the parent directory doesn't
/// disappear after the blob is deleted and while this operation is running,
/// no other client can delete the parent directory due to the use of leases.
///
/// This means applications can safely retry this operation without coordination to
/// guarantee only one client/process is trying to delete the same file.
Status DeleteFile(const std::string& path) override;
/// \brief Move/rename a file or directory.
///
/// There are no files immediately at the root directory, so paths like
/// "/segment" always refer to a container of the storage account and are
/// treated as directories.
///
/// If `dest` exists but the operation fails for some reason, `Move`
/// guarantees `dest` is not lost.
///
/// Conditions for a successful move:
///
/// 1. `src` must exist.
/// 2. `dest` can't contain a strict path prefix of `src`. More generally,
/// a directory can't be made a subdirectory of itself.
/// 3. If `dest` already exists and it's a file, `src` must also be a file.
/// `dest` is then replaced by `src`.
/// 4. All components of `dest` must exist, except for the last.
/// 5. If `dest` already exists and it's a directory, `src` must also be a
/// directory and `dest` must be empty. `dest` is then replaced by `src`
/// and its contents.
///
/// Leases are used to guarantee the pre-condition checks and the rename
/// operation are atomic: other clients can't invalidate the pre-condition in
/// the time between the checks and the actual rename operation.
///
/// This is possible because Move() is only supported on storage accounts with
/// Hierarchical Namespace Support enabled.
///
/// ## Limitations
///
/// - Moves are not supported on storage accounts without
/// Hierarchical Namespace support enabled
/// - Moves across different containers are not supported
/// - Moving a path of the form `/container` is not supported as it would
/// require moving all the files in a container to another container.
/// The only exception is a `Move("/container_a", "/container_b")` where
/// both containers are empty or `container_b` doesn't even exist.
/// The atomicity of the emptiness checks followed by the renaming operation
/// is guaranteed by the use of leases.
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
// Stream/file openers overriding the FileSystem interface. The FileInfo
// overloads take a previously obtained FileInfo instead of a path.
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const FileInfo& info) override;
// Per the AzureOptions::default_metadata documentation, that default is
// ignored when non-empty `metadata` is passed here.
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;
};
} // namespace arrow::fs

View File

@@ -0,0 +1,723 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <chrono>
#include <cstdint>
#include <functional>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "arrow/filesystem/type_fwd.h"
#include "arrow/io/interfaces.h"
#include "arrow/type_fwd.h"
#include "arrow/util/compare.h"
#include "arrow/util/macros.h"
#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
#include "arrow/util/windows_fixup.h"
namespace arrow {
namespace fs {
// Make arrow::util::Uri available as plain `Uri` inside arrow::fs.
using arrow::util::Uri;
// A system clock time point expressed as a 64-bit (or more) number of
// nanoseconds since the epoch.
using TimePoint =
std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
/// \brief Convert a FileType value to a string.
ARROW_EXPORT std::string ToString(FileType);
/// \brief Stream insertion operator for FileType values.
ARROW_EXPORT std::ostream& operator<<(std::ostream& os, FileType);
// Declared `inline constexpr` rather than `static const`: a `static` object in
// a header is a distinct entity in every translation unit, which is a problem
// for inline code (such as FileInfo's default member initializers) that refers
// to it. An inline variable is a single entity program-wide.

/// Sentinel stored as a FileInfo's size until a real size is set.
inline constexpr int64_t kNoSize = -1;
/// Sentinel stored as a FileInfo's modification time until a real one is set.
inline constexpr TimePoint kNoTime = TimePoint(TimePoint::duration(-1));
/// \brief Description of one filesystem entry: its path, type, size and
/// modification time
struct ARROW_EXPORT FileInfo : public util::EqualityComparable<FileInfo> {
  FileInfo() = default;
  FileInfo(const FileInfo&) = default;
  FileInfo(FileInfo&&) = default;
  FileInfo& operator=(const FileInfo&) = default;
  FileInfo& operator=(FileInfo&&) = default;

  /// Build an entry for `path`; size and mtime keep their default sentinels.
  explicit FileInfo(std::string path, FileType type = FileType::Unknown)
      : path_(std::move(path)), type_(type) {}

  /// The kind of entry
  FileType type() const { return type_; }
  void set_type(FileType type) { type_ = type; }

  /// The entry's full path in the filesystem
  const std::string& path() const { return path_; }
  void set_path(std::string path) { path_ = std::move(path); }

  /// The base name (the component after the last directory separator)
  std::string base_name() const;

  /// The directory name (the component before the file base name)
  std::string dir_name() const;

  /// The extension, without the dot
  std::string extension() const;

  /// The size in bytes, if available
  ///
  /// Only regular files are guaranteed to have a size.
  int64_t size() const { return size_; }
  void set_size(int64_t size) { size_ = size; }

  /// The last modification time, if available
  TimePoint mtime() const { return mtime_; }
  void set_mtime(TimePoint mtime) { mtime_ = mtime; }

  /// True when the entry's type is FileType::File
  bool IsFile() const { return FileType::File == type_; }
  /// True when the entry's type is FileType::Directory
  bool IsDirectory() const { return FileType::Directory == type_; }

  /// Two infos are equal when type, path, size and mtime all match.
  bool Equals(const FileInfo& other) const {
    if (type_ != other.type_) {
      return false;
    }
    if (path_ != other.path_) {
      return false;
    }
    if (size_ != other.size_) {
      return false;
    }
    return mtime_ == other.mtime_;
  }

  std::string ToString() const;

  /// Function object offering both a less-than comparison and a hash, each
  /// keyed on the path only, so that infos can be sorted, used as keys, and
  /// otherwise combined with the STL.
  struct ByPath {
    /// Less-than: orders by path.
    bool operator()(const FileInfo& l, const FileInfo& r) const {
      return r.path() > l.path();
    }

    /// Hash: the std::hash of the path string.
    size_t operator()(const FileInfo& i) const {
      std::hash<std::string> path_hasher{};
      return path_hasher(i.path());
    }
  };

 protected:
  std::string path_;
  FileType type_ = FileType::Unknown;
  int64_t size_ = kNoSize;
  TimePoint mtime_ = kNoTime;
};
/// \brief Stream insertion operator for FileInfo values.
ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const FileInfo&);
/// \brief Selection criteria passed to the filesystem APIs that list entries
struct ARROW_EXPORT FileSelector {
  /// The directory whose entries are selected.
  /// A path that exists but is not a directory should be treated as an error.
  std::string base_dir;

  /// What happens when `base_dir` is not found in the filesystem:
  /// false means an error is returned, true means an empty selection is
  /// returned. Defaults to false.
  bool allow_not_found = false;

  /// Whether subdirectories are recursed into. Defaults to false.
  bool recursive = false;

  /// The maximum number of subdirectories to recurse into.
  /// Defaults to INT32_MAX.
  int32_t max_recursion = INT32_MAX;

  // Deliberately user-provided rather than `= default`, so the struct remains
  // a non-aggregate; the default values live on the members above.
  FileSelector() {}
};
/// \brief FileSystem, path pair
struct ARROW_EXPORT FileLocator {
/// The filesystem half of the pair (shared ownership).
std::shared_ptr<FileSystem> filesystem;
/// The path half of the pair; presumably interpreted within `filesystem`.
std::string path;
};
/// A sequence of FileInfo entries.
using FileInfoVector = std::vector<FileInfo>;
/// A nullary callable returning a Future of FileInfoVector on each call.
using FileInfoGenerator = std::function<Future<FileInfoVector>()>;
} // namespace fs
/// Iteration traits for batches of FileInfo: an empty vector is the
/// end-of-iteration marker.
template <>
struct IterationTraits<fs::FileInfoVector> {
  /// True when `val` holds no entries, i.e. it is the end marker.
  static bool IsEnd(const fs::FileInfoVector& val) { return val.size() == 0; }

  /// The end marker: an empty vector.
  static fs::FileInfoVector End() { return fs::FileInfoVector{}; }
};
namespace fs {
/// \brief Abstract file system API
class ARROW_EXPORT FileSystem
    /// \cond false
    : public std::enable_shared_from_this<FileSystem>
/// \endcond
{  // NOLINT
 public:
  virtual ~FileSystem();

  /// The name identifying the concrete filesystem type.
  virtual std::string type_name() const = 0;

  /// EXPERIMENTAL: The IOContext associated with this filesystem.
  const io::IOContext& io_context() const { return io_context_; }

  /// Normalize path for the given filesystem
  ///
  /// The default implementation of this method is a no-op, but subclasses
  /// may allow normalizing irregular path forms (such as Windows local paths).
  virtual Result<std::string> NormalizePath(std::string path);

  /// \brief Ensure a URI (or path) is compatible with the given filesystem and return the
  ///        path
  ///
  /// \param uri_string A URI representing a resource in the given filesystem.
  ///
  /// This method will check to ensure the given filesystem is compatible with the
  /// URI. This can be useful when the user provides both a URI and a filesystem or
  /// when a user provides multiple URIs that should be compatible with the same
  /// filesystem.
  ///
  /// uri_string can be an absolute path instead of a URI.  In that case it will ensure
  /// the filesystem (if supplied) is the local filesystem (or some custom filesystem that
  /// is capable of reading local paths) and will normalize the path's file separators.
  ///
  /// Note, this method only checks to ensure the URI scheme is valid.  It will not detect
  /// inconsistencies like a mismatching region or endpoint override.
  ///
  /// \return The path inside the filesystem that is indicated by the URI.
  virtual Result<std::string> PathFromUri(const std::string& uri_string) const;

  /// \brief Make a URI from which FileSystemFromUri produces an equivalent filesystem
  /// \param path The path component to use in the resulting URI. Must be absolute.
  /// \return A URI string, or an error if an equivalent URI cannot be produced
  virtual Result<std::string> MakeUri(std::string path) const;

  /// Compare this filesystem with another one.
  virtual bool Equals(const FileSystem& other) const = 0;

  /// \brief Compare this filesystem with one held by shared pointer.
  ///
  /// A null `other` compares unequal; it is never dereferenced. A non-null
  /// `other` is forwarded to Equals(const FileSystem&).
  virtual bool Equals(const std::shared_ptr<FileSystem>& other) const {
    return other != nullptr && Equals(*other);
  }

  /// Get info for the given target.
  ///
  /// Any symlink is automatically dereferenced, recursively.
  /// A nonexistent or unreachable file returns an Ok status and
  /// has a FileType of value NotFound.  An error status indicates
  /// a truly exceptional condition (low-level I/O error, etc.).
  virtual Result<FileInfo> GetFileInfo(const std::string& path) = 0;

  /// Same, for many targets at once.
  virtual Result<FileInfoVector> GetFileInfo(const std::vector<std::string>& paths);

  /// Same, according to a selector.
  ///
  /// The selector's base directory will not be part of the results, even if
  /// it exists.
  /// If it doesn't exist, see `FileSelector::allow_not_found`.
  virtual Result<FileInfoVector> GetFileInfo(const FileSelector& select) = 0;

  /// Async version of GetFileInfo
  virtual Future<FileInfoVector> GetFileInfoAsync(const std::vector<std::string>& paths);

  /// Streaming async version of GetFileInfo
  ///
  /// The returned generator is not async-reentrant, i.e. you need to wait for
  /// the returned future to complete before calling the generator again.
  virtual FileInfoGenerator GetFileInfoGenerator(const FileSelector& select);

  /// Create a directory and subdirectories.
  ///
  /// This function succeeds if the directory already exists.
  virtual Status CreateDir(const std::string& path, bool recursive) = 0;

  /// Convenience overload: same as CreateDir(path, /*recursive=*/true).
  Status CreateDir(const std::string& path) { return CreateDir(path, true); }

  /// Delete a directory and its contents, recursively.
  virtual Status DeleteDir(const std::string& path) = 0;

  /// Delete a directory's contents, recursively.
  ///
  /// Like DeleteDir, but doesn't delete the directory itself.
  /// Passing an empty path ("" or "/") is disallowed, see DeleteRootDirContents.
  virtual Status DeleteDirContents(const std::string& path, bool missing_dir_ok) = 0;

  /// Convenience overload: same as
  /// DeleteDirContents(path, /*missing_dir_ok=*/false).
  Status DeleteDirContents(const std::string& path) {
    return DeleteDirContents(path, false);
  }

  /// Async version of DeleteDirContents.
  virtual Future<> DeleteDirContentsAsync(const std::string& path, bool missing_dir_ok);

  /// Async version of DeleteDirContents.
  ///
  /// This overload allows missing directories.
  Future<> DeleteDirContentsAsync(const std::string& path);

  /// EXPERIMENTAL: Delete the root directory's contents, recursively.
  ///
  /// Implementations may decide to raise an error if this operation is
  /// too dangerous.
  // NOTE: may decide to remove this if it's deemed not useful
  virtual Status DeleteRootDirContents() = 0;

  /// Delete a file.
  virtual Status DeleteFile(const std::string& path) = 0;

  /// Delete many files.
  ///
  /// The default implementation issues individual delete operations in sequence.
  virtual Status DeleteFiles(const std::vector<std::string>& paths);

  /// Move / rename a file or directory.
  ///
  /// If the destination exists:
  /// - if it is a non-empty directory, an error is returned
  /// - otherwise, if it has the same type as the source, it is replaced
  /// - otherwise, behavior is unspecified (implementation-dependent).
  virtual Status Move(const std::string& src, const std::string& dest) = 0;

  /// Copy a file.
  ///
  /// If the destination exists and is a directory, an error is returned.
  /// Otherwise, it is replaced.
  virtual Status CopyFile(const std::string& src, const std::string& dest) = 0;

  /// Open an input stream for sequential reading.
  virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream(
      const std::string& path) = 0;

  /// Open an input stream for sequential reading.
  ///
  /// This override assumes the given FileInfo validly represents the file's
  /// characteristics, and may optimize access depending on them (for example
  /// avoid querying the file size or its existence).
  virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info);

  /// Open an input file for random access reading.
  virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
      const std::string& path) = 0;

  /// Open an input file for random access reading.
  ///
  /// This override assumes the given FileInfo validly represents the file's
  /// characteristics, and may optimize access depending on them (for example
  /// avoid querying the file size or its existence).
  virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
      const FileInfo& info);

  /// Async version of OpenInputStream
  virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
      const std::string& path);

  /// Async version of OpenInputStream
  virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
      const FileInfo& info);

  /// Async version of OpenInputFile
  virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
      const std::string& path);

  /// Async version of OpenInputFile
  virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
      const FileInfo& info);

  /// Open an output stream for sequential writing.
  ///
  /// If the target already exists, existing data is truncated.
  virtual Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
      const std::string& path,
      const std::shared_ptr<const KeyValueMetadata>& metadata) = 0;

  /// Overload of OpenOutputStream that takes no metadata argument.
  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(const std::string& path);

  /// Open an output stream for appending.
  ///
  /// If the target doesn't exist, a new empty file is created.
  ///
  /// Note: some filesystem implementations do not support efficient appending
  /// to an existing file, in which case this method will return NotImplemented.
  /// Consider writing to multiple files (using e.g. the dataset layer) instead.
  virtual Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
      const std::string& path,
      const std::shared_ptr<const KeyValueMetadata>& metadata) = 0;

  /// Overload of OpenAppendStream that takes no metadata argument.
  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(const std::string& path);

 protected:
  /// Store the IOContext to associate with this filesystem
  /// (io::default_io_context() when none is given).
  explicit FileSystem(io::IOContext io_context = io::default_io_context())
      : io_context_(std::move(io_context)) {}

  io::IOContext io_context_;
  // Whether metadata operations (such as GetFileInfo or OpenInputStream)
  // are cheap enough that the default async variants don't bother with
  // a thread pool.
  bool default_async_is_sync_ = true;
};
/// \brief A filesystem factory callable, tagged with a source location
/// (file and line) that serves as its identity.
struct FileSystemFactory {
  /// The factory itself: given a URI and an IOContext it yields a filesystem
  /// or an error; it also receives an `out_path` output pointer.
  std::function<Result<std::shared_ptr<FileSystem>>(
      const Uri& uri, const io::IOContext& io_context, std::string* out_path)>
      function;
  /// Source file recorded for this factory.
  std::string_view file;
  /// Source line recorded for this factory.
  int line;

  /// Factories compare equal when their recorded file and line both match;
  /// the callable itself takes no part in the comparison.
  bool operator==(const FileSystemFactory& other) const {
    // libarrow may be linked statically into both the executable and a
    // dynamically loaded filesystem implementation library. That library then
    // carries its own copy of the registry and of every FileSystemRegistrar
    // statically linked to libarrow. The file and line of the registrar's
    // definition are what identify such duplicate factories as equivalent
    // when they are retrieved from the library.
    if (line != other.line) {
      return false;
    }
    return file == other.file;
  }
};
/// \brief A FileSystem implementation that delegates to another
/// implementation after prepending a fixed base path.
///
/// This is useful to expose a logical view of a subtree of a filesystem,
/// for example a directory in a LocalFileSystem.
/// This works on abstract paths, i.e. paths using forward slashes and
/// a single root "/". Windows paths are not guaranteed to work.
/// This makes no security guarantee. For example, symlinks may allow to
/// "escape" the subtree and access other parts of the underlying filesystem.
class ARROW_EXPORT SubTreeFileSystem : public FileSystem {
public:
// This constructor may abort if base_path is invalid.
explicit SubTreeFileSystem(const std::string& base_path,
std::shared_ptr<FileSystem> base_fs);
~SubTreeFileSystem() override;
/// \brief The type name of this filesystem: always "subtree".
std::string type_name() const override { return "subtree"; }
/// \brief The stored base path (returned by value, i.e. as a copy).
std::string base_path() const { return base_path_; }
/// \brief The filesystem that operations are delegated to.
std::shared_ptr<FileSystem> base_fs() const { return base_fs_; }
Result<std::string> NormalizePath(std::string path) override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;
bool Equals(const FileSystem& other) const override;
/// \cond FALSE
// Re-expose the base-class overloads that would otherwise be hidden by the
// overrides declared below.
using FileSystem::CreateDir;
using FileSystem::DeleteDirContents;
using FileSystem::GetFileInfo;
using FileSystem::OpenAppendStream;
using FileSystem::OpenOutputStream;
/// \endcond
// Overrides of the FileSystem interface. Per the class description, each
// delegates to the base filesystem after prepending the base path.
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive) override;
Status DeleteDir(const std::string& path) override;
Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const FileInfo& info) override;
Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
const std::string& path) override;
Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
const FileInfo& info) override;
Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
const std::string& path) override;
Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
const FileInfo& info) override;
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
protected:
// Default constructor for subclasses only; it leaves base_path_ empty and
// base_fs_ null.
SubTreeFileSystem() = default;
// The fixed base path; const, so it cannot change after construction.
const std::string base_path_;
// The filesystem that operations are delegated to.
std::shared_ptr<FileSystem> base_fs_;
// Path translation helpers. NOTE(review): judging by their names these add
// the base path to, or strip it from, a path; the exact semantics (including
// how PrependBaseNonEmpty differs from PrependBase) live in the
// implementation file.
Result<std::string> PrependBase(const std::string& s) const;
Result<std::string> PrependBaseNonEmpty(const std::string& s) const;
Result<std::string> StripBase(const std::string& s) const;
// NOTE(review): presumably rewrites `info` in place to the subtree's view of
// the path -- confirm in the implementation.
Status FixInfo(FileInfo* info) const;
// Static helper that produces the normalized form of `base_path` using
// `base_fs`, or an error.
static Result<std::string> NormalizeBasePath(
std::string base_path, const std::shared_ptr<FileSystem>& base_fs);
};
/// \brief A FileSystem implementation that delegates to another
/// implementation but inserts latencies at various points.
///
/// Three constructors are offered: one taking an explicit latency generator, and
/// two taking an average latency (optionally with a seed).
class ARROW_EXPORT SlowFileSystem : public FileSystem {
public:
/// \param[in] base_fs the filesystem that is wrapped
/// \param[in] latencies generator for the latencies to insert
SlowFileSystem(std::shared_ptr<FileSystem> base_fs,
std::shared_ptr<io::LatencyGenerator> latencies);
/// \param[in] base_fs the filesystem that is wrapped
/// \param[in] average_latency average latency to insert
///            (NOTE(review): unit is not stated here, presumably seconds -- confirm)
SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency);
/// Same as the previous constructor, with an explicit `seed`
/// (presumably seeding the random latency generation -- confirm).
SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency,
int32_t seed);
/// Fixed type identifier of this implementation.
std::string type_name() const override { return "slow"; }
bool Equals(const FileSystem& other) const override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;
// The using-declarations below keep the FileSystem overloads of these names
// visible alongside the overrides declared in this class. The \cond markers
// exclude them from the generated Doxygen output.
/// \cond FALSE
using FileSystem::CreateDir;
using FileSystem::DeleteDirContents;
using FileSystem::GetFileInfo;
using FileSystem::OpenAppendStream;
using FileSystem::OpenOutputStream;
/// \endcond
// Overrides of the FileSystem interface. No GetFileInfoGenerator or *Async
// overrides are declared by this class.
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive) override;
Status DeleteDir(const std::string& path) override;
Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const FileInfo& info) override;
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
protected:
// The wrapped filesystem.
std::shared_ptr<FileSystem> base_fs_;
// Source of the latencies that are inserted.
std::shared_ptr<io::LatencyGenerator> latencies_;
};
/// \brief Ensure all registered filesystem implementations are finalized.
///
/// Individual finalizers may wait for concurrent calls to finish so as to avoid
/// race conditions. After this function has been called, all filesystem APIs
/// will fail with an error.
///
/// The user is responsible for synchronization of calls to this function.
///
/// NOTE(review): unlike the other free functions declared in this header, this
/// declaration carries no ARROW_EXPORT -- confirm whether that is intentional.
void EnsureFinalized();
/// \defgroup filesystem-factories Functions for creating FileSystem instances
///
/// @{
/// \brief Create a new FileSystem by URI
///
/// Recognized schemes are "file", "mock", "hdfs", "viewfs", "s3",
/// "gs" and "gcs".
///
/// Support for other schemes can be added using RegisterFileSystemFactory.
///
/// \param[in] uri a URI-based path, ex: file:///some/local/path
/// \param[out] out_path (optional) Path inside the filesystem.
/// \return out_fs FileSystem instance.
ARROW_EXPORT
Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri,
std::string* out_path = NULLPTR);
/// \brief Create a new FileSystem by URI with a custom IO context
///
/// Recognized schemes are "file", "mock", "hdfs", "viewfs", "s3",
/// "gs" and "gcs".
///
/// Support for other schemes can be added using RegisterFileSystemFactory.
///
/// \param[in] uri a URI-based path, ex: file:///some/local/path
/// \param[in] io_context an IOContext which will be associated with the filesystem
/// \param[out] out_path (optional) Path inside the filesystem.
/// \return out_fs FileSystem instance.
ARROW_EXPORT
Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri,
const io::IOContext& io_context,
std::string* out_path = NULLPTR);
/// \brief Create a new FileSystem by URI or local filesystem path
///
/// Support for other schemes can be added using RegisterFileSystemFactory.
///
/// Same as FileSystemFromUri, but in addition also recognize non-URIs
/// and treat them as local filesystem paths. Only absolute local filesystem
/// paths are allowed.
///
/// \param[in] uri a URI-based path, or an absolute local filesystem path
/// \param[out] out_path (optional) Path inside the filesystem.
/// \return out_fs FileSystem instance.
ARROW_EXPORT
Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
const std::string& uri, std::string* out_path = NULLPTR);
/// \brief Create a new FileSystem by URI or local filesystem path, with a
/// custom IO context
///
/// Support for other schemes can be added using RegisterFileSystemFactory.
///
/// Same as FileSystemFromUri, but in addition also recognize non-URIs
/// and treat them as local filesystem paths. Only absolute local filesystem
/// paths are allowed.
///
/// \param[in] uri a URI-based path, or an absolute local filesystem path
/// \param[in] io_context an IOContext which will be associated with the filesystem
/// \param[out] out_path (optional) Path inside the filesystem.
/// \return out_fs FileSystem instance.
ARROW_EXPORT
Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
const std::string& uri, const io::IOContext& io_context,
std::string* out_path = NULLPTR);
/// @}
/// \defgroup filesystem-factory-registration Helpers for FileSystem registration
///
/// @{
/// \brief Register a FileSystem factory
///
/// Support for custom URI schemes can be added by registering a factory
/// for the corresponding FileSystem.
///
/// \param[in] scheme a Uri scheme which the factory will handle.
/// If a factory has already been registered for a scheme,
/// the new factory will be ignored.
/// \param[in] factory a function which can produce a FileSystem for Uris which match
/// scheme.
/// \param[in] finalizer a function which must be called to finalize the factory before
/// the process exits, or nullptr if no finalization is necessary.
/// Defaults to an empty std::function.
/// \return raises KeyError if a name collision occurs.
///
/// NOTE(review): the description of `scheme` says a duplicate registration is
/// ignored, while the return description mentions a KeyError on name collision.
/// Confirm which of the two applies and align the text.
ARROW_EXPORT Status RegisterFileSystemFactory(std::string scheme,
FileSystemFactory factory,
std::function<void()> finalizer = {});
/// \brief Register FileSystem factories from a shared library
///
/// FileSystem implementations may be housed in separate shared libraries and only
/// registered when the shared library is explicitly loaded. FileSystemRegistrar is
/// provided to simplify definition of such libraries: each instance at namespace scope
/// in the library will register a factory for a scheme. Any library which uses
/// FileSystemRegistrars and which must be dynamically loaded should be loaded using
/// LoadFileSystemFactories(), which will additionally merge registries if necessary
/// (static linkage to arrow can produce isolated registries).
///
/// \param[in] libpath path of the shared library to load
ARROW_EXPORT Status LoadFileSystemFactories(const char* libpath);
/// \brief Helper whose construction registers a FileSystem factory.
///
/// The struct declares no data members; its only member is the constructor.
struct ARROW_EXPORT FileSystemRegistrar {
/// \brief Register a FileSystem factory at load time
///
/// Support for custom URI schemes can be added by registering a factory for the
/// corresponding FileSystem. An instance of this helper can be defined at namespace
/// scope to cause the factory to be registered at load time.
///
/// Global constructors will finish execution before main() starts if the registrar is
/// linked into the same binary as main(), or before dlopen()/LoadLibrary() returns if
/// the library in which the registrar is defined is dynamically loaded.
///
/// \code
/// FileSystemRegistrar kSlowFileSystemModule{
/// "slowfile",
/// [](const Uri& uri, const io::IOContext& io_context, std::string* out_path)
/// ->Result<std::shared_ptr<FileSystem>> {
/// auto local_uri = "file" + uri.ToString().substr(uri.scheme().size());
/// ARROW_ASSIGN_OR_RAISE(auto base_fs,
/// FileSystemFromUri(local_uri, io_context, out_path));
/// double average_latency = 1;
/// int32_t seed = 0xDEADBEEF;
/// ARROW_ASSIGN_OR_RAISE(auto params, uri.query_item());
/// for (const auto& [key, value] : params) {
/// if (key == "average_latency") {
/// average_latency = std::stod(value);
/// }
/// if (key == "seed") {
/// seed = std::stoi(value, nullptr, /*base=*/16);
/// }
/// }
/// return std::make_shared<SlowFileSystem>(base_fs, average_latency, seed);
/// }};
/// \endcode
///
/// \param[in] scheme a Uri scheme which the factory will handle.
/// If a factory has already been registered for a scheme, the
/// new factory will be ignored.
/// \param[in] factory a function which can produce a FileSystem for Uris which match
/// scheme.
/// \param[in] finalizer a function which must be called to finalize the factory before
/// the process exits, or nullptr if no finalization is necessary.
/// Defaults to an empty std::function.
FileSystemRegistrar(std::string scheme, FileSystemFactory factory,
std::function<void()> finalizer = {});
};
/// \brief Expand to a braced-initialized ::arrow::fs::FileSystemRegistrar expression.
///
/// `factory_function` is wrapped in a ::arrow::fs::FileSystemFactory together with
/// the __FILE__ and __LINE__ of the expansion site. The expansion has neither a
/// variable name nor a trailing semicolon; it is presumably meant to initialize a
/// namespace-scope object, e.g. `auto kModule = ARROW_REGISTER_FILESYSTEM(...);`
/// -- confirm the intended usage.
#define ARROW_REGISTER_FILESYSTEM(scheme, factory_function, finalizer) \
::arrow::fs::FileSystemRegistrar { \
scheme, ::arrow::fs::FileSystemFactory{factory_function, __FILE__, __LINE__}, \
finalizer \
}
/// @}
namespace internal {
// Return the filesystem factory registry as an opaque pointer. The concrete
// type behind the void* is not visible in this header.
ARROW_EXPORT void* GetFileSystemRegistry();
} // namespace internal
/// \brief Copy files, including from one FileSystem to another
///
/// If a source and destination are resident in the same FileSystem, FileSystem::CopyFile
/// will be used, otherwise the file will be opened as a stream in both FileSystems and
/// chunks copied from the source to the destination. No directories will be created.
///
/// \param[in] sources locations of the files to copy
/// \param[in] destinations locations to copy to (presumably matched one-to-one with
///            `sources` by index -- confirm)
/// \param[in] io_context IO context to use; defaults to io::default_io_context()
/// \param[in] chunk_size size of the chunks copied between streams; defaults to
///            1024 * 1024 (presumably bytes -- confirm)
/// \param[in] use_threads defaults to true (presumably enables parallel copying --
///            confirm)
ARROW_EXPORT
Status CopyFiles(const std::vector<FileLocator>& sources,
const std::vector<FileLocator>& destinations,
const io::IOContext& io_context = io::default_io_context(),
int64_t chunk_size = 1024 * 1024, bool use_threads = true);
/// \brief Copy selected files, including from one FileSystem to another
///
/// Directories will be created under the destination base directory as needed.
///
/// \param[in] source_fs filesystem to copy from
/// \param[in] source_sel selector designating the files to copy
/// \param[in] destination_fs filesystem to copy to
/// \param[in] destination_base_dir base directory in `destination_fs`
/// \param[in] io_context IO context to use; defaults to io::default_io_context()
/// \param[in] chunk_size defaults to 1024 * 1024 (presumably bytes -- confirm)
/// \param[in] use_threads defaults to true (presumably enables parallel copying --
///            confirm)
ARROW_EXPORT
Status CopyFiles(const std::shared_ptr<FileSystem>& source_fs,
const FileSelector& source_sel,
const std::shared_ptr<FileSystem>& destination_fs,
const std::string& destination_base_dir,
const io::IOContext& io_context = io::default_io_context(),
int64_t chunk_size = 1024 * 1024, bool use_threads = true);
/// \brief Global options accepted by Initialize().
///
/// Both members are default-constructed (empty) strings unless set by the caller.
struct FileSystemGlobalOptions {
/// Path to a single PEM file holding all TLS CA certificates
///
/// If empty, the underlying TLS library's defaults will be used.
std::string tls_ca_file_path;
/// Path to a directory holding TLS CA certificates in individual PEM files
/// named along the OpenSSL "hashed" format.
///
/// If empty, the underlying TLS library's defaults will be used.
std::string tls_ca_dir_path;
};
/// EXPERIMENTAL: optional global initialization routine
///
/// This is for environments (such as manylinux) where the path
/// to TLS CA certificates needs to be configured at runtime.
///
/// \param[in] options the global options (TLS CA file and directory paths) to apply
ARROW_EXPORT
Status Initialize(const FileSystemGlobalOptions& options);
} // namespace fs
} // namespace arrow

View File

@@ -0,0 +1,41 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "arrow/filesystem/filesystem.h"
namespace arrow::fs {
extern "C" {
// ARROW_FORCE_EXPORT ensures this function's visibility is
// _declspec(dllexport)/[[gnu::visibility("default")]] even when
// this header is #included by a non-arrow source, as in a third
// party filesystem implementation.
ARROW_FORCE_EXPORT void* arrow_filesystem_get_registry();
// Return the registry pointer obtained from internal::GetFileSystemRegistry().
//
// NOTE(review): this is a non-inline function definition inside a header, so the
// header presumably has to be included from exactly one translation unit of each
// library -- confirm and consider stating it in the header's documentation.
void* arrow_filesystem_get_registry() {
// In the case where libarrow is linked statically both to the executable and to a
// dynamically loaded filesystem implementation library, the library contains a
// duplicate definition of the registry into which the library's instances of
// FileSystemRegistrar insert their factories. This function is made accessible to
// dlsym/GetProcAddress to enable detection of such duplicate registries and merging
// into the registry accessible to the executable.
return internal::GetFileSystemRegistry();
}
}
} // namespace arrow::fs

View File

@@ -0,0 +1,242 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <optional>
#include <string>
#include <vector>
#include "arrow/filesystem/filesystem.h"
#include "arrow/util/uri.h"
namespace arrow {
namespace fs {
namespace internal {
// Opaque wrapper for GCS's library credentials to avoid exposing in Arrow headers.
struct GcsCredentialsHolder;
} // namespace internal
class GcsFileSystem;
/// \brief Container for GCS Credentials and information necessary to recreate them.
///
/// The default constructor is private; only the friends GcsFileSystem and GcsOptions
/// can create instances directly. Other code sees the read-only accessors below.
class ARROW_EXPORT GcsCredentials {
public:
bool Equals(const GcsCredentials& other) const;
/// Value of the `anonymous_` flag (false unless set by a friend).
bool anonymous() const { return anonymous_; }
/// Stored access token; returned by const reference.
const std::string& access_token() const { return access_token_; }
/// Stored expiration time point.
TimePoint expiration() const { return expiration_; }
/// Stored target service account; returned by const reference.
const std::string& target_service_account() const { return target_service_account_; }
/// Stored JSON credentials string; returned by const reference.
const std::string& json_credentials() const { return json_credentials_; }
/// Shared pointer to the opaque internal::GcsCredentialsHolder (may be null, since
/// it is default-initialized).
const std::shared_ptr<internal::GcsCredentialsHolder>& holder() const {
return holder_;
}
private:
GcsCredentials() = default;
bool anonymous_ = false;
std::string access_token_;
TimePoint expiration_;
std::string target_service_account_;
std::string json_credentials_;
std::shared_ptr<internal::GcsCredentialsHolder> holder_;
// Friends that populate the private fields.
friend class GcsFileSystem;
friend struct GcsOptions;
};
/// Options for the GcsFileSystem implementation.
struct ARROW_EXPORT GcsOptions {
/// \brief Equivalent to GcsOptions::Defaults().
GcsOptions();
/// \brief Credentials carried by these options.
GcsCredentials credentials;
// NOTE(review): the next two fields are undocumented. Presumably
// `endpoint_override` is the address of a non-default GCS endpoint and `scheme`
// the URI scheme (e.g. "https") used to reach it -- confirm.
std::string endpoint_override;
std::string scheme;
/// \brief Location to use for creating buckets.
std::string default_bucket_location;
/// \brief If set used to control total time allowed for retrying underlying
/// errors.
///
/// The default policy is to retry for up to 15 minutes.
std::optional<double> retry_limit_seconds;
/// \brief Default metadata for OpenOutputStream.
///
/// This will be ignored if non-empty metadata is passed to OpenOutputStream.
std::shared_ptr<const KeyValueMetadata> default_metadata;
/// \brief The project to use for creating buckets.
///
/// If not set, the library uses the GOOGLE_CLOUD_PROJECT environment
/// variable. Most I/O operations do not need a project id, only applications
/// that create new buckets need a project id.
std::optional<std::string> project_id;
bool Equals(const GcsOptions& other) const;
/// \brief Initialize with Google Default Credentials
///
/// Create options configured to use [Application Default Credentials][aip/4110]. The
/// details of this mechanism are too involved to describe here, but suffice it to say
/// that applications can override any defaults using an environment variable
/// (`GOOGLE_APPLICATION_CREDENTIALS`), and that the defaults work with most Google
/// Cloud Platform deployment environments (GCE, GKE, Cloud Run, etc.), and that they
/// have the same behavior as the `gcloud` CLI tool on your workstation.
///
/// \see https://cloud.google.com/docs/authentication
///
/// [aip/4110]: https://google.aip.dev/auth/4110
static GcsOptions Defaults();
/// \brief Initialize with anonymous credentials
static GcsOptions Anonymous();
/// \brief Initialize with access token
///
/// These credentials are useful when using an out-of-band mechanism to fetch access
/// tokens. Note that access tokens are time limited, you will need to manually refresh
/// the tokens created by the out-of-band mechanism.
static GcsOptions FromAccessToken(const std::string& access_token,
TimePoint expiration);
/// \brief Initialize with service account impersonation
///
/// Service account impersonation allows one principal (a user or service account) to
/// impersonate a service account. It requires that the calling principal has the
/// necessary permissions *on* the service account.
static GcsOptions FromImpersonatedServiceAccount(
const GcsCredentials& base_credentials, const std::string& target_service_account);
/// Creates service account credentials from a JSON object in string form.
///
/// The @p json_object is expected to be in the format described by [aip/4112]. Such an
/// object contains the identity of a service account, as well as a private key that can
/// be used to sign tokens, showing the caller was holding the private key.
///
/// In GCP one can create several "keys" for each service account, and these keys are
/// downloaded as a JSON "key file". The contents of such a file are in the format
/// required by this function. Remember that key files and their contents should be
/// treated as any other secret with security implications, think of them as passwords
/// (because they are!), don't store them or output them where unauthorized persons may
/// read them.
///
/// Most applications should probably use default credentials, maybe pointing them to a
/// file with these contents. Using this function may be useful when the json object is
/// obtained from a Cloud Secret Manager or a similar service.
///
/// [aip/4112]: https://google.aip.dev/auth/4112
static GcsOptions FromServiceAccountCredentials(const std::string& json_object);
/// Initialize from URIs such as "gs://bucket/object".
///
/// Two overloads are provided: one for an already-parsed arrow::util::Uri and one for
/// a URI string. Presumably `out_path` receives the path extracted from the URI --
/// confirm, including whether it may be null.
static Result<GcsOptions> FromUri(const arrow::util::Uri& uri, std::string* out_path);
static Result<GcsOptions> FromUri(const std::string& uri, std::string* out_path);
};
/// \brief GCS-backed FileSystem implementation.
///
/// GCS (Google Cloud Storage - https://cloud.google.com/storage) is a scalable object
/// storage system for any amount of data. The main abstractions in GCS are buckets and
/// objects. A bucket is a namespace for objects, buckets can store any number of objects,
/// tens of millions and even billions is not uncommon. Each object contains a single
/// blob of data, up to 5TiB in size. Buckets are typically configured to keep a single
/// version of each object, but versioning can be enabled. Versioning is important because
/// objects are immutable, once created one cannot append data to the object or modify the
/// object data in any way.
///
/// GCS buckets are in a global namespace, if a Google Cloud customer creates a bucket
/// named `foo` no other customer can create a bucket with the same name. Note that a
/// principal (a user or service account) may only list the buckets they are entitled to,
/// and then only within a project. It is not possible to list "all" the buckets.
///
/// Within each bucket objects are in a flat namespace. GCS does not have folders or
/// directories. However, following some conventions it is possible to emulate
/// directories. To this end, this class follows these rules:
///
/// - All buckets are treated as directories at the "root"
/// - Creating a root directory results in a new bucket being created, this may be slower
/// than most GCS operations.
/// - The class creates marker objects for a directory, using a metadata attribute to
/// annotate the file.
/// - GCS can list all the objects with a given prefix, this is used to emulate listing
/// of directories.
/// - In object lists GCS can summarize all the objects with a common prefix as a single
/// entry, this is used to emulate non-recursive lists. Note that GCS list time is
/// proportional to the number of objects in the prefix. Listing recursively takes
/// almost the same time as non-recursive lists.
///
/// Instances are created through Make(); the constructor is private.
///
/// NOTE(review): unlike HadoopFileSystem and LocalFileSystem, this class has no
/// `using FileSystem::...` declarations, so any additional FileSystem overloads of
/// the overridden names are hidden when calling through a GcsFileSystem-typed
/// expression -- confirm this is intended.
class ARROW_EXPORT GcsFileSystem : public FileSystem {
public:
~GcsFileSystem() override = default;
std::string type_name() const override;
/// The options this filesystem holds; returned by const reference.
const GcsOptions& options() const;
bool Equals(const FileSystem& other) const override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive) override;
Status DeleteDir(const std::string& path) override;
// Default arguments are bound statically: `missing_dir_ok = false` only applies
// to calls made through a GcsFileSystem-typed expression.
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
/// This is not implemented in GcsFileSystem, as it would be too dangerous.
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const FileInfo& info) override;
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
/// Create a GcsFileSystem instance from the given options.
///
/// The IO context defaults to io::default_io_context().
static Result<std::shared_ptr<GcsFileSystem>> Make(
const GcsOptions& options, const io::IOContext& = io::default_io_context());
private:
explicit GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context);
// Implementation class, defined outside this header.
class Impl;
std::shared_ptr<Impl> impl_;
};
} // namespace fs
} // namespace arrow

View File

@@ -0,0 +1,117 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "arrow/filesystem/filesystem.h"
#include "arrow/io/hdfs.h"
#include "arrow/util/uri.h"
namespace arrow::fs {
/// Options for the HDFS implementation.
///
/// The Configure* member functions take the value to configure; their definitions
/// are not in this header (presumably they store the value in `connection_config`
/// or in the matching field below -- confirm).
struct ARROW_EXPORT HdfsOptions {
HdfsOptions() = default;
~HdfsOptions() = default;
/// Hdfs configuration options, contains host, port, driver
io::HdfsConnectionConfig connection_config;
/// Used by Hdfs OpenWritable Interface.
// Defaults: buffer_size 0, replication 3, default_block_size 0.
// NOTE(review): the meaning of 0 is not stated here; presumably it lets HDFS pick
// its own default -- confirm.
int32_t buffer_size = 0;
int16_t replication = 3;
int64_t default_block_size = 0;
void ConfigureEndPoint(std::string host, int port);
void ConfigureReplication(int16_t replication);
void ConfigureUser(std::string user_name);
void ConfigureBufferSize(int32_t buffer_size);
void ConfigureBlockSize(int64_t default_block_size);
void ConfigureKerberosTicketCachePath(std::string path);
void ConfigureExtraConf(std::string key, std::string val);
bool Equals(const HdfsOptions& other) const;
/// Build options from a URI, given either as a parsed Uri or as a string.
static Result<HdfsOptions> FromUri(const ::arrow::util::Uri& uri);
static Result<HdfsOptions> FromUri(const std::string& uri);
};
/// HDFS-backed FileSystem implementation.
///
/// implementation notes:
/// - This is a wrapper of arrow/io/hdfs, so we can use FileSystem API to handle hdfs.
/// - Instances are created through Make(); the constructor is protected.
class ARROW_EXPORT HadoopFileSystem : public FileSystem {
public:
~HadoopFileSystem() override;
/// Fixed type identifier of this implementation.
std::string type_name() const override { return "hdfs"; }
/// The options of this filesystem, returned by value.
HdfsOptions options() const;
bool Equals(const FileSystem& other) const override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;
// The using-declarations below keep the FileSystem overloads of these names
// visible alongside the overrides declared in this class. The \cond markers
// exclude them from the generated Doxygen output.
/// \cond FALSE
using FileSystem::CreateDir;
using FileSystem::DeleteDirContents;
using FileSystem::GetFileInfo;
using FileSystem::OpenAppendStream;
using FileSystem::OpenOutputStream;
/// \endcond
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive) override;
Status DeleteDir(const std::string& path) override;
Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
/// Create a HadoopFileSystem instance from the given options.
///
/// The IO context defaults to io::default_io_context().
static Result<std::shared_ptr<HadoopFileSystem>> Make(
const HdfsOptions& options, const io::IOContext& = io::default_io_context());
protected:
HadoopFileSystem(const HdfsOptions& options, const io::IOContext&);
// Implementation class, defined outside this header and exclusively owned here.
class Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace arrow::fs

View File

@@ -0,0 +1,132 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "arrow/filesystem/filesystem.h"
namespace arrow {
namespace internal {
// NOTE(review): this forward-declares arrow::internal::Uri, but the only Uri used
// in this header is ::arrow::util::Uri (in LocalFileSystemOptions::FromUri). The
// declaration looks stale -- confirm whether it should name arrow::util::Uri.
class Uri;
}
namespace fs {
/// Options for the LocalFileSystem implementation.
struct ARROW_EXPORT LocalFileSystemOptions {
/// Default value of `directory_readahead`.
static constexpr int32_t kDefaultDirectoryReadahead = 16;
/// Default value of `file_info_batch_size`.
static constexpr int32_t kDefaultFileInfoBatchSize = 1000;
/// Whether OpenInputStream and OpenInputFile return a mmap'ed file,
/// or a regular one.
bool use_mmap = false;
/// Options related to `GetFileInfoGenerator` interface.
/// EXPERIMENTAL: The maximum number of directories processed in parallel
/// by `GetFileInfoGenerator`.
int32_t directory_readahead = kDefaultDirectoryReadahead;
/// EXPERIMENTAL: The maximum number of entries aggregated into each
/// FileInfoVector chunk by `GetFileInfoGenerator`.
///
/// Since each FileInfo entry needs a separate `stat` system call, a
/// directory with a very large number of files may take a lot of time to
/// process entirely. By generating a FileInfoVector after this chunk
/// size is reached, we ensure FileInfo entries can start being consumed
/// from the FileInfoGenerator with less initial latency.
int32_t file_info_batch_size = kDefaultFileInfoBatchSize;
/// \brief Initialize with defaults
static LocalFileSystemOptions Defaults();
bool Equals(const LocalFileSystemOptions& other) const;
/// Build options from a parsed URI. Presumably `out_path` receives the local path
/// extracted from the URI -- confirm, including whether it may be null.
static Result<LocalFileSystemOptions> FromUri(const ::arrow::util::Uri& uri,
std::string* out_path);
};
/// \brief A FileSystem implementation accessing files on the local machine.
///
/// This class handles only `/`-separated paths. If desired, conversion
/// from Windows backslash-separated paths should be done by the caller.
/// Details such as symlinks are abstracted away (symlinks are always
/// followed, except when deleting an entry).
class ARROW_EXPORT LocalFileSystem : public FileSystem {
public:
/// Construct without explicit options; the IO context defaults to
/// io::default_io_context().
explicit LocalFileSystem(const io::IOContext& = io::default_io_context());
/// Construct with explicit options; the IO context defaults to
/// io::default_io_context().
explicit LocalFileSystem(const LocalFileSystemOptions&,
const io::IOContext& = io::default_io_context());
~LocalFileSystem() override;
/// Fixed type identifier of this implementation.
std::string type_name() const override { return "local"; }
Result<std::string> NormalizePath(std::string path) override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;
Result<std::string> MakeUri(std::string path) const override;
bool Equals(const FileSystem& other) const override;
/// Return a copy of the stored options.
LocalFileSystemOptions options() const { return options_; }
// The using-declarations below keep the FileSystem overloads of these names
// visible alongside the overrides declared in this class. The \cond markers
// exclude them from the generated Doxygen output.
/// \cond FALSE
using FileSystem::CreateDir;
using FileSystem::DeleteDirContents;
using FileSystem::GetFileInfo;
using FileSystem::OpenAppendStream;
using FileSystem::OpenOutputStream;
/// \endcond
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive) override;
Status DeleteDir(const std::string& path) override;
Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
protected:
// Options of this filesystem, exposed through options().
LocalFileSystemOptions options_;
};
} // namespace fs
} // namespace arrow

View File

@@ -0,0 +1,134 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <iosfwd>
#include <memory>
#include <string>
#include <string_view>
#include <vector>
#include "arrow/filesystem/filesystem.h"
#include "arrow/util/windows_fixup.h"
namespace arrow::fs::internal {
// Description of one directory, as returned by MockFileSystem::AllDirs().
struct MockDirInfo {
  std::string full_path;
  TimePoint mtime;

  // Entries compare equal when both the modification time and the full path match.
  bool operator==(const MockDirInfo& other) const {
    if (!(mtime == other.mtime)) {
      return false;
    }
    return full_path == other.full_path;
  }

  // Stream insertion operator; only declared here.
  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream&, const MockDirInfo&);
};
// Description of one file, as returned by MockFileSystem::AllFiles().
struct MockFileInfo {
  std::string full_path;
  TimePoint mtime;
  // Non-owning view of the file contents: the viewed bytes are not kept alive
  // by this struct.
  std::string_view data;

  // Entries compare equal when modification time, full path and contents all match.
  bool operator==(const MockFileInfo& other) const {
    if (!(mtime == other.mtime)) {
      return false;
    }
    if (!(full_path == other.full_path)) {
      return false;
    }
    return data == other.data;
  }

  // Stream insertion operator; only declared here.
  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream&, const MockFileInfo&);
};
/// A mock FileSystem implementation that holds its contents in memory.
///
/// Useful for validating the FileSystem API, writing conformance suites,
/// and bootstrapping FileSystem-based APIs.
class ARROW_EXPORT MockFileSystem : public FileSystem {
public:
/// Construct a mock filesystem from a caller-supplied time point and an IO
/// context (io::default_io_context() if omitted).
// NOTE(review): `current_time` is presumably the timestamp recorded on created
// entries -- confirm against the implementation.
explicit MockFileSystem(TimePoint current_time,
const io::IOContext& = io::default_io_context());
~MockFileSystem() override;
/// Always returns "mock".
std::string type_name() const override { return "mock"; }
bool Equals(const FileSystem& other) const override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;
/// \cond FALSE
// Bring the base-class overloads back into scope: the overrides declared
// below would otherwise hide every other overload sharing these names.
using FileSystem::CreateDir;
using FileSystem::DeleteDirContents;
using FileSystem::GetFileInfo;
using FileSystem::OpenAppendStream;
using FileSystem::OpenOutputStream;
/// \endcond
// Overrides of the FileSystem interface.
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive) override;
Status DeleteDir(const std::string& path) override;
Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
// Contents-dumping helpers to ease testing.
// Output is lexicographically-ordered by full path.
// Note that MockFileInfo::data is a std::string_view, i.e. a non-owning view.
std::vector<MockDirInfo> AllDirs();
std::vector<MockFileInfo> AllFiles();
// Create a File with a content from a string.
// NOTE(review): `recursive` presumably controls creation of missing parent
// directories -- confirm against the implementation.
Status CreateFile(const std::string& path, std::string_view content,
bool recursive = true);
// Create a MockFileSystem out of (empty) FileInfo. The content of every
// file is empty and of size 0. All directories will be created recursively.
static Result<std::shared_ptr<FileSystem>> Make(TimePoint current_time,
const std::vector<FileInfo>& infos);
// Implementation class; only forward-declared in this header.
class Impl;
protected:
std::unique_ptr<Impl> impl_;
};
// A MockFileSystem variant that clears `default_async_is_sync_` and provides
// its own GetFileInfoGenerator().
class ARROW_EXPORT MockAsyncFileSystem : public MockFileSystem {
public:
// Forwards both arguments to MockFileSystem, then sets `default_async_is_sync_`
// to false.
// NOTE(review): `default_async_is_sync_` is not declared in this header; it is
// presumably an inherited FileSystem member controlling whether the default
// async methods run synchronously -- confirm.
explicit MockAsyncFileSystem(TimePoint current_time,
const io::IOContext& io_context = io::default_io_context())
: MockFileSystem(current_time, io_context) {
default_async_is_sync_ = false;
}
FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
};
} // namespace arrow::fs::internal

View File

@@ -0,0 +1,178 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <optional>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "arrow/type_fwd.h"
namespace arrow {
namespace fs {
namespace internal {
// Separator used by default between the components of an abstract path.
constexpr char kSep = '/';
// Computations on abstract paths (not local paths with system-dependent behaviour).
// Abstract paths are typically used in URIs.
// Split an abstract path into its individual components.
ARROW_EXPORT
std::vector<std::string> SplitAbstractPath(const std::string& path, char sep = kSep);
// Slice the individual components of an abstract path and combine them
//
// If offset or length are negative then an empty string is returned
// If offset is >= the number of components then an empty string is returned
// If offset + length is >= the number of components then length is truncated
ARROW_EXPORT
std::string SliceAbstractPath(const std::string& path, int offset, int length,
char sep = kSep);
// Return the extension of the file
ARROW_EXPORT std::string GetAbstractPathExtension(const std::string& s);
// Return the depth (number of components) of an abstract path
//
// Trailing slashes do not count towards depth
// Leading slashes do not count towards depth
//
// The root path ("/") has depth 0
ARROW_EXPORT int GetAbstractPathDepth(std::string_view path);
// Return the parent directory and basename of an abstract path. Both values may be
// empty.
ARROW_EXPORT
std::pair<std::string, std::string> GetAbstractPathParent(const std::string& s);
// Validate an abstract path.
ARROW_EXPORT
Status ValidateAbstractPath(std::string_view path);
// Validate the components of an abstract path.
ARROW_EXPORT
Status ValidateAbstractPathParts(const std::vector<std::string>& parts);
// Append a non-empty stem to an abstract path.
ARROW_EXPORT
std::string ConcatAbstractPath(std::string_view base, std::string_view stem);
// Make path relative to base, if it starts with base. Otherwise error out.
ARROW_EXPORT
Result<std::string> MakeAbstractPathRelative(const std::string& base,
const std::string& path);
// NOTE(review): judging by the name, returns a copy of `s` that starts with a
// slash -- confirm against the implementation.
ARROW_EXPORT
std::string EnsureLeadingSlash(std::string_view s);
// NOTE(review): judging by the name, strips leading slash(es) from `s`. The
// result is a std::string_view, i.e. a non-owning view -- confirm what it
// refers to against the implementation.
ARROW_EXPORT
std::string_view RemoveLeadingSlash(std::string_view s);
// NOTE(review): judging by the name, returns a copy of `s` that ends with a
// slash -- confirm against the implementation.
ARROW_EXPORT
std::string EnsureTrailingSlash(std::string_view s);
/// \brief remove the forward slash (if any) from the given path
/// \param s the input path
/// \param preserve_root if true, allow a path of just "/" to remain unchanged
ARROW_EXPORT
std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = false);
// NOTE(review): judging by the name, returns a non-OK Status when `s` ends with
// a slash -- confirm against the implementation.
ARROW_EXPORT
Status AssertNoTrailingSlash(std::string_view s);
// Tell whether `s` is non-empty and its last character is the separator.
inline bool HasTrailingSlash(std::string_view s) {
  if (s.empty()) {
    return false;
  }
  return s.back() == kSep;
}
// Tell whether `s` is non-empty and its first character is the separator.
inline bool HasLeadingSlash(std::string_view s) {
  if (s.empty()) {
    return false;
  }
  return s.front() == kSep;
}
// NOTE(review): judging by the name, tells whether `ancestor` is a parent
// (direct or indirect) of `descendant` -- confirm edge cases such as equal
// paths against the implementation.
ARROW_EXPORT
bool IsAncestorOf(std::string_view ancestor, std::string_view descendant);
// NOTE(review): judging by the name and return type, yields the remainder of
// `descendant` once `ancestor` is stripped, or std::nullopt when it does not
// apply -- confirm against the implementation.
ARROW_EXPORT
std::optional<std::string_view> RemoveAncestor(std::string_view ancestor,
std::string_view descendant);
/// Return a vector of ancestors between a base path and a descendant.
/// For example,
///
/// AncestorsFromBasePath("a/b", "a/b/c/d/e") -> ["a/b/c", "a/b/c/d"]
ARROW_EXPORT
std::vector<std::string> AncestorsFromBasePath(std::string_view base_path,
std::string_view descendant);
/// Given a vector of paths of directories which must be created, produce the minimal
/// subset for passing to CreateDir(recursive=true) by removing redundant parent
/// directories
ARROW_EXPORT
std::vector<std::string> MinimalCreateDirSet(std::vector<std::string> dirs);
// Join the components of an abstract path.
//
// Empty components are skipped, and `sep` is only inserted between two
// non-empty components, so the result never starts or ends with a separator
// that this function added.
template <class StringIt>
std::string JoinAbstractPath(StringIt it, StringIt end, char sep = kSep) {
  std::string joined;
  for (; it != end; ++it) {
    const auto& component = *it;
    if (!component.empty()) {
      if (!joined.empty()) {
        joined.push_back(sep);
      }
      joined += component;
    }
  }
  return joined;
}
// Join every component of `range`; delegates to the iterator-pair overload
// with range.begin() and range.end().
template <class StringRange>
std::string JoinAbstractPath(const StringRange& range, char sep = kSep) {
  auto first = range.begin();
  auto last = range.end();
  return JoinAbstractPath(first, last, sep);
}
/// Convert slashes to backslashes, on all platforms. Mostly useful for testing.
ARROW_EXPORT
std::string ToBackslashes(std::string_view s);
/// Ensure a local path is abstract, by converting backslashes to regular slashes
/// on Windows. Return the path unchanged on other systems.
ARROW_EXPORT
std::string ToSlashes(std::string_view s);
// NOTE(review): the exact definition of an "empty" path (e.g. whether a path
// made only of separators counts) is not visible here -- confirm against the
// implementation.
ARROW_EXPORT
bool IsEmptyPath(std::string_view s);
// NOTE(review): judging by the name, a heuristic check rather than a full URI
// validation -- confirm against the implementation.
ARROW_EXPORT
bool IsLikelyUri(std::string_view s);
// Matches paths against a pattern given at construction.
// NOTE(review): the supported pattern syntax is not visible in this header --
// see the implementation.
// NOTE(review): std::unique_ptr is used below but <memory> is not among this
// header's direct includes; it presumably arrives via "arrow/type_fwd.h".
class ARROW_EXPORT Globber {
public:
// Declared here, defined out of line (Impl is an incomplete type in this header).
~Globber();
explicit Globber(std::string pattern);
// Tell whether `path` matches the pattern. Not declared const.
bool Matches(const std::string& path);
protected:
// Implementation struct; only forward-declared in this header.
struct Impl;
std::unique_ptr<Impl> impl_;
};
} // namespace internal
} // namespace fs
} // namespace arrow

View File

@@ -0,0 +1,108 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <utility>
#include <gtest/gtest.h>
#include "arrow/filesystem/s3fs.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
namespace arrow {
namespace fs {
// A minio test server, managed as a child process
class MinioTestServer {
public:
MinioTestServer();
~MinioTestServer();
// Start the server; TLS is off unless `enable_tls` is true.
Status Start(bool enable_tls = false);
Status Stop();
// Accessors for the parameters needed to reach the server.
// NOTE(review): presumably only meaningful after a successful Start() -- confirm.
std::string connect_string() const;
std::string access_key() const;
std::string secret_key() const;
std::string ca_dir_path() const;
std::string ca_file_path() const;
std::string scheme() const;
private:
// NOTE(review): presumably produces the certificate used when TLS is
// enabled -- confirm against the implementation.
Status GenerateCertificateFile();
// Implementation struct; only forward-declared in this header.
struct Impl;
std::unique_ptr<Impl> impl_;
};
// A Minio "environment" that spawns Minio processes in advance, so as
// to hide process launch latencies during testing.
class MinioTestEnvironment : public ::testing::Environment {
 public:
  // `enable_tls` defaults to false.
  // NOTE(review): presumably forwarded to MinioTestServer::Start() for each
  // spawned server -- confirm against the implementation.
  explicit MinioTestEnvironment(bool enable_tls = false);
  // Marked `override` like the other overriding destructors in this codebase:
  // ::testing::Environment declares a virtual destructor.
  ~MinioTestEnvironment() override;

  void SetUp() override;

  // Hand out one Minio server, or an error Status on failure.
  Result<std::shared_ptr<MinioTestServer>> GetOneServer();

 protected:
  // Implementation struct; only forward-declared in this header.
  struct Impl;
  std::unique_ptr<Impl> impl_;
};
// A global test "environment", to ensure that the S3 API is initialized before
// running unit tests.
class S3Environment : public ::testing::Environment {
public:
// We set this environment variable to speed up tests by ensuring
// DefaultAWSCredentialsProviderChain does not query (inaccessible)
// EC2 metadata endpoint.
// This must be done before spawning any Minio child process to avoid any race
// condition accessing environment variables.
S3Environment() : ec2_metadata_disabled_guard_("AWS_EC2_METADATA_DISABLED", "true") {}
// Initialize the S3 APIs with log level Fatal; a failure fails the test run
// through ASSERT_OK.
void SetUp() override {
// Change this to increase logging during tests
S3GlobalOptions options;
options.log_level = S3LogLevel::Fatal;
ASSERT_OK(InitializeS3(options));
}
// Finalize the S3 APIs that SetUp() initialized.
void TearDown() override { ASSERT_OK(FinalizeS3()); }
private:
// Holds AWS_EC2_METADATA_DISABLED=true from construction onwards.
// NOTE(review): EnvVarGuard presumably restores the previous value on
// destruction -- confirm.
EnvVarGuard ec2_metadata_disabled_guard_;
};
} // namespace fs
} // namespace arrow

View File

@@ -0,0 +1,467 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "arrow/filesystem/filesystem.h"
#include "arrow/util/macros.h"
#include "arrow/util/uri.h"
// Forward declarations of the AWS SDK types referenced below, so that this
// header does not have to include any AWS SDK header itself.
namespace Aws::Auth {
class AWSCredentialsProvider;
class STSAssumeRoleCredentialsProvider;
} // namespace Aws::Auth
namespace Aws::STS {
class STSClient;
} // namespace Aws::STS
namespace arrow::fs {
/// Options for using a proxy for S3
struct ARROW_EXPORT S3ProxyOptions {
/// Proxy URI scheme (the FromUri examples below use "http")
std::string scheme;
/// Proxy host
std::string host;
/// Proxy port; initialized to -1.
// NOTE(review): -1 presumably means "not set" -- confirm against the implementation.
int port = -1;
/// Optional proxy credentials (see the first URI form accepted by FromUri)
std::string username;
std::string password;
/// Initialize from URI such as http://username:password@host:port
/// or http://host:port
static Result<S3ProxyOptions> FromUri(const std::string& uri);
static Result<S3ProxyOptions> FromUri(const ::arrow::util::Uri& uri);
/// Compare with another set of proxy options
bool Equals(const S3ProxyOptions& other) const;
};
/// The way S3 credentials are obtained (see S3Options::credentials_kind).
enum class S3CredentialsKind : int8_t {
  Anonymous = 0,   ///< Anonymous access (no credentials used)
  Default = 1,     ///< Default AWS credentials, configured through environment variables
  Explicit = 2,    ///< Explicitly-provided access key pair
  Role = 3,        ///< Role assumed through a role ARN
  WebIdentity = 4  ///< Role assumed through a web identity token, configured through
                   ///< environment variables
};
/// Pure virtual class for describing custom S3 retry strategies
///
/// Implementations override ShouldRetry() and CalculateDelayBeforeNextRetry();
/// two stock strategies are available through the static factories below.
class ARROW_EXPORT S3RetryStrategy {
public:
virtual ~S3RetryStrategy() = default;
/// Simple struct where each field corresponds to a field in Aws::Client::AWSError
// Note: `error_type` and `should_retry` have no default member initializer, so
// a default-initialized AWSErrorDetail leaves them indeterminate.
struct AWSErrorDetail {
/// Corresponds to AWSError::GetErrorType()
int error_type;
/// Corresponds to AWSError::GetMessage()
std::string message;
/// Corresponds to AWSError::GetExceptionName()
std::string exception_name;
/// Corresponds to AWSError::ShouldRetry()
bool should_retry;
};
/// Returns true if the S3 request resulting in the provided error should be retried.
virtual bool ShouldRetry(const AWSErrorDetail& error, int64_t attempted_retries) = 0;
/// Returns the time in milliseconds the S3 client should sleep for until retrying.
virtual int64_t CalculateDelayBeforeNextRetry(const AWSErrorDetail& error,
int64_t attempted_retries) = 0;
/// Returns a stock AWS Default retry strategy.
static std::shared_ptr<S3RetryStrategy> GetAwsDefaultRetryStrategy(
int64_t max_attempts);
/// Returns a stock AWS Standard retry strategy.
static std::shared_ptr<S3RetryStrategy> GetAwsStandardRetryStrategy(
int64_t max_attempts);
};
/// Options for the S3FileSystem implementation.
///
/// Instances are typically obtained through one of the static factories
/// declared at the end of this struct, then adjusted field by field.
struct ARROW_EXPORT S3Options {
/// \brief Smart defaults for option values
///
/// The possible values for this setting are explained in the AWS docs:
/// https://docs.aws.amazon.com/sdkref/latest/guide/feature-smart-config-defaults.html
std::string smart_defaults = "standard";
/// \brief AWS region to connect to.
///
/// If unset, the AWS SDK will choose a default value. The exact algorithm
/// depends on the SDK version. Before 1.8, the default is hardcoded
/// to "us-east-1". Since 1.8, several heuristics are used to determine
/// the region (environment variables, configuration profile, EC2 metadata
/// server).
std::string region;
/// \brief Socket connection timeout, in seconds
///
/// If negative, the AWS SDK default value is used (typically 1 second).
double connect_timeout = -1;
/// \brief Socket read timeout on Windows and macOS, in seconds
///
/// If negative, the AWS SDK default value is used (typically 3 seconds).
/// This option is ignored on non-Windows, non-macOS systems.
double request_timeout = -1;
/// If non-empty, override region with a connect string such as "localhost:9000"
// XXX perhaps instead take a URL like "http://localhost:9000"?
std::string endpoint_override;
/// S3 connection transport, default "https"
std::string scheme = "https";
/// ARN of role to assume
std::string role_arn;
/// Optional identifier for an assumed role session.
std::string session_name;
/// Optional external identifier to pass to STS when assuming a role
std::string external_id;
/// Frequency (in seconds) to refresh temporary credentials from assumed role
int load_frequency = 900;
/// If connection is through a proxy, set options here
S3ProxyOptions proxy_options;
/// AWS credentials provider
std::shared_ptr<Aws::Auth::AWSCredentialsProvider> credentials_provider;
/// Type of credentials being used. Set along with credentials_provider.
S3CredentialsKind credentials_kind = S3CredentialsKind::Default;
/// Whether to use virtual addressing of buckets
///
/// If true, then virtual addressing is always enabled.
/// If false, then virtual addressing is only enabled if `endpoint_override` is empty.
///
/// This can be used for non-AWS backends that only support virtual hosted-style access.
bool force_virtual_addressing = false;
/// Whether OutputStream writes will be issued in the background, without blocking.
bool background_writes = true;
/// Whether to allow creation of buckets
///
/// When S3FileSystem creates new buckets, it does not pass any non-default settings.
/// In AWS S3, the bucket and all objects will be not publicly visible, and there
/// will be no bucket policies and no resource tags. To have more control over how
/// buckets are created, use a different API to create them.
bool allow_bucket_creation = false;
/// Whether to allow deletion of buckets
bool allow_bucket_deletion = false;
/// Whether to allow pessimistic directory creation in CreateDir function
///
/// By default, CreateDir function will try to create the directory without checking its
/// existence. It's an optimization to try directory creation and catch the error,
/// rather than issue two dependent I/O calls.
/// Though for key/value storage like Google Cloud Storage, too many creation calls will
/// breach the rate limit for object mutation operations and cause serious consequences.
/// It's also possible you don't have creation access for the parent directory. Set it
/// to be true to address these scenarios.
bool check_directory_existence_before_creation = false;
/// Whether to allow file-open methods to return before the actual open.
///
/// Enabling this may reduce the latency of `OpenInputStream`, `OpenOutputStream`,
/// and similar methods, by reducing the number of roundtrips necessary. It may also
/// allow usage of more efficient S3 APIs for small files.
/// The downside is that failure conditions such as attempting to open a file in a
/// non-existing bucket will only be reported when actual I/O is done (at worst,
/// when attempting to close the file).
bool allow_delayed_open = false;
/// \brief Default metadata for OpenOutputStream.
///
/// This will be ignored if non-empty metadata is passed to OpenOutputStream.
std::shared_ptr<const KeyValueMetadata> default_metadata;
/// Optional retry strategy to determine which error types should be retried, and the
/// delay between retries.
std::shared_ptr<S3RetryStrategy> retry_strategy;
/// Optional customer-provided key for server-side encryption (SSE-C).
///
/// This should be the 32-byte AES-256 key, unencoded.
std::string sse_customer_key;
/// Optional path to a single PEM file holding all TLS CA certificates
///
/// If empty, global filesystem options will be used (see FileSystemGlobalOptions);
/// if the corresponding global filesystem option is also empty, the underlying
/// TLS library's defaults will be used.
///
/// Note this option may be ignored on some systems (Windows, macOS).
std::string tls_ca_file_path;
/// Optional path to a directory holding TLS CA
///
/// The given directory should contain CA certificates as individual PEM files
/// named along the OpenSSL "hashed" format.
///
/// If empty, global filesystem options will be used (see FileSystemGlobalOptions);
/// if the corresponding global filesystem option is also empty, the underlying
/// TLS library's defaults will be used.
///
/// Note this option may be ignored on some systems (Windows, macOS).
std::string tls_ca_dir_path;
/// Whether to verify the S3 endpoint's TLS certificate
///
/// This option applies if the scheme is "https".
bool tls_verify_certificates = true;
// Declared here, defined out of line.
S3Options();
/// Configure with the default AWS credentials provider chain.
void ConfigureDefaultCredentials();
/// Configure with anonymous credentials. This will only let you access public buckets.
void ConfigureAnonymousCredentials();
/// Configure with explicit access and secret key.
void ConfigureAccessKey(const std::string& access_key, const std::string& secret_key,
const std::string& session_token = "");
/// Configure with credentials from an assumed role.
void ConfigureAssumeRoleCredentials(
const std::string& role_arn, const std::string& session_name = "",
const std::string& external_id = "", int load_frequency = 900,
const std::shared_ptr<Aws::STS::STSClient>& stsClient = NULLPTR);
/// Configure with credentials from role assumed using a web identity token
void ConfigureAssumeRoleWithWebIdentityCredentials();
// Credential accessors.
// NOTE(review): presumably read from `credentials_provider` -- confirm against
// the implementation.
std::string GetAccessKey() const;
std::string GetSecretKey() const;
std::string GetSessionToken() const;
/// Compare with another set of options
bool Equals(const S3Options& other) const;
/// \brief Initialize with default credentials provider chain
///
/// This is recommended if you use the standard AWS environment variables
/// and/or configuration file.
static S3Options Defaults();
/// \brief Initialize with anonymous credentials.
///
/// This will only let you access public buckets.
static S3Options Anonymous();
/// \brief Initialize with explicit access and secret key.
///
/// Optionally, a session token may also be provided for temporary credentials
/// (from STS).
static S3Options FromAccessKey(const std::string& access_key,
const std::string& secret_key,
const std::string& session_token = "");
/// \brief Initialize from an assumed role.
static S3Options FromAssumeRole(
const std::string& role_arn, const std::string& session_name = "",
const std::string& external_id = "", int load_frequency = 900,
const std::shared_ptr<Aws::STS::STSClient>& stsClient = NULLPTR);
/// \brief Initialize from an assumed role with web-identity.
/// Uses the AWS SDK which uses environment variables to
/// generate temporary credentials.
static S3Options FromAssumeRoleWithWebIdentity();
// Build options from a URI, given either parsed or as a string.
// NOTE(review): `out_path`, when non-null, presumably receives the path part of
// the URI -- confirm against the implementation.
static Result<S3Options> FromUri(const ::arrow::util::Uri& uri,
std::string* out_path = NULLPTR);
static Result<S3Options> FromUri(const std::string& uri,
std::string* out_path = NULLPTR);
};
/// S3-backed FileSystem implementation.
///
/// Some implementation notes:
/// - buckets are special and the operations available on them may be limited
/// or more expensive than desired.
///
/// Instances are created through the static Make() factory; the constructor
/// is protected.
class ARROW_EXPORT S3FileSystem : public FileSystem {
public:
~S3FileSystem() override;
/// Always returns "s3".
std::string type_name() const override { return "s3"; }
/// Return the original S3 options when constructing the filesystem
S3Options options() const;
/// Return the actual region this filesystem connects to
std::string region() const;
bool Equals(const FileSystem& other) const override;
Result<std::string> PathFromUri(const std::string& uri_string) const override;
Result<std::string> MakeUri(std::string path) const override;
/// \cond FALSE
// Bring the base-class overloads back into scope: the overrides declared
// below would otherwise hide every other overload sharing these names.
using FileSystem::CreateDir;
using FileSystem::DeleteDirContents;
using FileSystem::DeleteDirContentsAsync;
using FileSystem::GetFileInfo;
using FileSystem::OpenAppendStream;
using FileSystem::OpenOutputStream;
/// \endcond
Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
Status CreateDir(const std::string& path, bool recursive) override;
Status DeleteDir(const std::string& path) override;
Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
Future<> DeleteDirContentsAsync(const std::string& path, bool missing_dir_ok) override;
Status DeleteRootDirContents() override;
Status DeleteFile(const std::string& path) override;
Status Move(const std::string& src, const std::string& dest) override;
Status CopyFile(const std::string& src, const std::string& dest) override;
/// Create a sequential input stream for reading from a S3 object.
///
/// NOTE: Reads from the stream will be synchronous and unbuffered.
/// You may want to wrap the stream in a BufferedInputStream or use
/// a custom readahead strategy to avoid idle waits.
Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
/// Create a sequential input stream for reading from a S3 object.
///
/// This override avoids a HEAD request by assuming the FileInfo
/// contains correct information.
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
/// Create a random access file for reading from a S3 object.
///
/// See OpenInputStream for performance notes.
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
/// Create a random access file for reading from a S3 object.
///
/// This override avoids a HEAD request by assuming the FileInfo
/// contains correct information.
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const FileInfo& info) override;
/// Create a sequential output stream for writing to a S3 object.
///
/// NOTE: Writes to the stream will be buffered. Depending on
/// S3Options.background_writes, they can be synchronous or not.
/// It is recommended to enable background_writes unless you prefer
/// implementing your own background execution strategy.
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;
/// Create a S3FileSystem instance from the given options.
static Result<std::shared_ptr<S3FileSystem>> Make(
const S3Options& options, const io::IOContext& = io::default_io_context());
protected:
// Not publicly constructible; use Make().
explicit S3FileSystem(const S3Options& options, const io::IOContext&);
// Implementation class; only forward-declared in this header. Unlike the
// other filesystems in this codebase it is held through a shared_ptr.
class Impl;
std::shared_ptr<Impl> impl_;
};
/// Log level for S3-originating messages (see S3GlobalOptions::log_level).
enum class S3LogLevel : int8_t {
  Off,
  Fatal,
  Error,
  Warn,
  Info,
  Debug,
  Trace
};
/// Global options consumed by InitializeS3().
struct ARROW_EXPORT S3GlobalOptions {
  /// The log level for S3-originating messages.
  ///
  /// Carries a default member initializer so that a default-constructed
  /// S3GlobalOptions never holds an indeterminate value; explicitly assigned
  /// or aggregate-initialized values behave as before.
  // NOTE(review): Fatal is the level the S3 test environment sets explicitly;
  // confirm it matches the fallback used by Defaults().
  S3LogLevel log_level = S3LogLevel::Fatal;

  /// The number of threads to configure when creating AWS' I/O event loop
  ///
  /// Defaults to 1 as recommended by AWS' doc when the # of connections is
  /// expected to be, at most, in the hundreds
  ///
  /// For more details see Aws::Crt::Io::EventLoopGroup
  int num_event_loop_threads = 1;

  /// Whether to install a process-wide SIGPIPE handler
  ///
  /// The AWS SDK may sometimes emit SIGPIPE signals for certain errors;
  /// by default, they would abort the current process.
  /// This option, if enabled, will install a process-wide signal handler
  /// that logs and otherwise ignores incoming SIGPIPE signals.
  ///
  /// This option has no effect on Windows.
  bool install_sigpipe_handler = false;

  /// \brief Initialize with default options
  ///
  /// For log_level, this method first tries to extract a suitable value from the
  /// environment variable ARROW_S3_LOG_LEVEL.
  static S3GlobalOptions Defaults();
};
/// \brief Initialize the S3 APIs with the specified set of options.
///
/// It is required to call this function at least once before using S3FileSystem.
///
/// Once this function is called you MUST call FinalizeS3 before the end of the
/// application in order to avoid a segmentation fault at shutdown.
ARROW_EXPORT
Status InitializeS3(const S3GlobalOptions& options);
/// \brief Ensure the S3 APIs are initialized, but only if not already done.
///
/// If necessary, this will call InitializeS3() with some default options.
ARROW_EXPORT
Status EnsureS3Initialized();
/// Whether S3 was initialized, and not finalized.
ARROW_EXPORT
bool IsS3Initialized();
/// Whether S3 was finalized.
ARROW_EXPORT
bool IsS3Finalized();
/// \brief Shutdown the S3 APIs.
///
/// This can wait for some S3 concurrent calls to finish so as to avoid
/// race conditions.
/// After this function has been called, all S3 calls will fail with an error.
///
/// Calls to InitializeS3() and FinalizeS3() should be serialized by the
/// application (this also applies to EnsureS3Initialized() and
/// EnsureS3Finalized()).
ARROW_EXPORT
Status FinalizeS3();
/// \brief Ensure the S3 APIs are shutdown, but only if not already done.
///
/// If necessary, this will call FinalizeS3().
ARROW_EXPORT
Status EnsureS3Finalized();
// NOTE(review): judging by the name, looks up the region in which `bucket`
// resides; whether this needs the S3 APIs to be initialized or issues a
// network request is not visible here -- confirm against the implementation.
ARROW_EXPORT
Result<std::string> ResolveS3BucketRegion(const std::string& bucket);
} // namespace arrow::fs

View File

@@ -0,0 +1,264 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <chrono>
#include <memory>
#include <string>
#include <vector>
#include "arrow/filesystem/filesystem.h"
#include "arrow/filesystem/mockfs.h"
#include "arrow/testing/visibility.h"
#include "arrow/util/counting_semaphore_internal.h"
namespace arrow {
namespace fs {
// NOTE(review): presumably the tolerance allowed when tests compare
// timestamps -- confirm against the usages.
static constexpr double kTimeSlack = 2.0; // In seconds
// Shorthand building a FileInfo of type FileType::File for `path`.
static inline FileInfo File(std::string path) {
  FileInfo info(std::move(path), FileType::File);
  return info;
}
// Shorthand building a FileInfo of type FileType::Directory for `path`.
static inline FileInfo Dir(std::string path) {
  FileInfo info(std::move(path), FileType::Directory);
  return info;
}
// A subclass of MockFileSystem that blocks operations until an unlock method is
// called.
//
// This is intended for testing fine-grained ordering of filesystem operations.
//
// N.B. Only OpenOutputStream supports gating at the moment but this is simply because
// it is all that has been needed so far. Feel free to add support for more methods
// as required.
class ARROW_TESTING_EXPORT GatedMockFilesystem : public internal::MockFileSystem {
public:
// Both arguments mirror those of the MockFileSystem constructor.
GatedMockFilesystem(TimePoint current_time,
const io::IOContext& = io::default_io_context());
~GatedMockFilesystem() override;
// The gated operation. `metadata` defaults to an empty shared_ptr here, so
// callers going through this class may omit it.
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
// Wait until at least num_waiters are waiting on OpenOutputStream
Status WaitForOpenOutputStream(uint32_t num_waiters);
// Unlock `num_waiters` individual calls to OpenOutputStream
Status UnlockOpenOutputStream(uint32_t num_waiters);
private:
// Semaphore on which the gating of OpenOutputStream is built.
util::CountingSemaphore open_output_sem_;
};
// Create a file at `path` on the filesystem `fs`, using `data` for its contents.
// NOTE(review): inferred from the signature. The function returns void, so
// failures are presumably reported through test assertions -- confirm in the
// implementation.
ARROW_TESTING_EXPORT
void CreateFile(FileSystem* fs, const std::string& path, const std::string& data);
// Sort a vector of FileInfo by lexicographic path order
//
// The vector is passed by pointer and nothing is returned: the result is
// delivered by modifying `*infos`.
ARROW_TESTING_EXPORT
void SortInfos(FileInfoVector* infos);
// Create a copy of a FileInfo vector sorted by lexicographic path order
//
// The input is taken by const reference; the sorted copy is the return value.
ARROW_TESTING_EXPORT
FileInfoVector SortedInfos(const FileInfoVector& infos);
// Collect the FileInfo entries produced by `gen` into `*out_infos`.
// NOTE(review): inferred from the signature. Presumably the generator is drained
// until exhaustion; whether `*out_infos` is cleared first or appended to, and how
// errors are reported, is not visible here -- confirm in the implementation.
ARROW_TESTING_EXPORT
void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos);
// Assertion helpers for FileInfo.
//
// Overloads taking a FileInfo: check `info` against the expected `path` and
// `type`, plus the expected `mtime` and/or `size` when the overload has those
// parameters.
// NOTE(review): inferred from the signatures; the actual checks live in the
// implementation file and presumably report failures via test assertions.
ARROW_TESTING_EXPORT
void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type);
ARROW_TESTING_EXPORT
void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type,
TimePoint mtime);
ARROW_TESTING_EXPORT
void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type,
TimePoint mtime, int64_t size);
ARROW_TESTING_EXPORT
void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type,
int64_t size);
// Overloads taking a FileSystem: same expected values as above, but starting
// from a filesystem and a path instead of an existing FileInfo.
// NOTE(review): presumably these fetch the FileInfo for `path` from `fs` and then
// perform the same checks -- confirm in the implementation.
ARROW_TESTING_EXPORT
void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type);
ARROW_TESTING_EXPORT
void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type,
TimePoint mtime);
ARROW_TESTING_EXPORT
void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type,
TimePoint mtime, int64_t size);
ARROW_TESTING_EXPORT
void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type, int64_t size);
// Assert that the file at `path` on the filesystem `fs` holds `expected_data`.
// NOTE(review): inferred from the signature. How the file is read and how a
// mismatch is reported is not visible here -- confirm in the implementation.
ARROW_TESTING_EXPORT
void AssertFileContents(FileSystem* fs, const std::string& path,
const std::string& expected_data);
template <typename Duration>
void AssertDurationBetween(Duration d, double min_secs, double max_secs) {
auto seconds = std::chrono::duration_cast<std::chrono::duration<double>>(d);
ASSERT_GE(seconds.count(), min_secs);
ASSERT_LE(seconds.count(), max_secs);
}
// Generic tests for FileSystem implementations.
// To use this class, subclass both from it and ::testing::Test,
// implement GetEmptyFileSystem(), and use GENERIC_FS_TEST_FUNCTIONS()
// to define the various tests.
class ARROW_TESTING_EXPORT GenericFileSystemTest {
public:
virtual ~GenericFileSystemTest();
// Test entry points, one per generic test. These are the methods that
// GENERIC_FS_TEST_FUNCTION invokes as this->Test<NAME>().
void TestEmpty();
void TestNormalizePath();
void TestCreateDir();
void TestDeleteDir();
void TestDeleteDirContents();
void TestDeleteRootDirContents();
void TestDeleteFile();
void TestDeleteFiles();
void TestMoveFile();
void TestMoveDir();
void TestCopyFile();
void TestCopyFiles();
void TestGetFileInfo();
void TestGetFileInfoVector();
void TestGetFileInfoSelector();
void TestGetFileInfoSelectorWithRecursion();
void TestGetFileInfoAsync();
void TestGetFileInfoGenerator();
void TestOpenOutputStream();
void TestOpenAppendStream();
void TestOpenInputStream();
void TestOpenInputStreamWithFileInfo();
void TestOpenInputStreamAsync();
void TestOpenInputFile();
void TestOpenInputFileWithFileInfo();
void TestOpenInputFileAsync();
void TestSpecialChars();
protected:
// This function should return the filesystem under test.
virtual std::shared_ptr<FileSystem> GetEmptyFileSystem() = 0;
// Override the following functions to specify deviations from expected
// filesystem semantics.
// Each hook returns a fixed default (visible in its inline body below), so a
// subclass only needs to override the ones that differ for its filesystem.
// - Whether the filesystem may "implicitly" create intermediate directories
virtual bool have_implicit_directories() const { return false; }
// - Whether the filesystem may allow writing a file "over" a directory
virtual bool allow_write_file_over_dir() const { return false; }
// - Whether the filesystem may allow writing a directory "over" a file,
// for example copying file "A" to "B/C" while "B" exists and is a file.
virtual bool allow_write_implicit_dir_over_file() const { return false; }
// - Whether the filesystem allows reading a directory
virtual bool allow_read_dir_as_file() const { return false; }
// - Whether the filesystem allows moving a file
virtual bool allow_move_file() const { return true; }
// - Whether the filesystem allows moving a directory
virtual bool allow_move_dir() const { return true; }
// - Whether the filesystem allows moving a directory "over" a non-empty destination
virtual bool allow_move_dir_over_non_empty_dir() const { return false; }
// - Whether the filesystem allows appending to a file
virtual bool allow_append_to_file() const { return true; }
// - Whether the filesystem allows appending to a nonexistent file
virtual bool allow_append_to_new_file() const { return true; }
// - Whether the filesystem supports directory modification times
virtual bool have_directory_mtimes() const { return true; }
// - Whether some directory tree deletion tests may fail randomly
virtual bool have_flaky_directory_tree_deletion() const { return false; }
// - Whether the filesystem stores some metadata alongside files
virtual bool have_file_metadata() const { return false; }
// - Whether the filesystem has a false positive memory leak with generator
virtual bool have_false_positive_memory_leak_with_generator() const { return false; }
// - Whether the filesystem has a false positive memory leak in async close
virtual bool have_false_positive_memory_leak_with_async_close() const { return false; }
// Overloads of the public tests that receive the filesystem to exercise.
// NOTE(review): presumably each public Test*() obtains a filesystem from
// GetEmptyFileSystem() and forwards it to the matching overload below --
// confirm in the implementation file.
void TestEmpty(FileSystem* fs);
void TestNormalizePath(FileSystem* fs);
void TestCreateDir(FileSystem* fs);
void TestDeleteDir(FileSystem* fs);
void TestDeleteDirContents(FileSystem* fs);
void TestDeleteRootDirContents(FileSystem* fs);
void TestDeleteFile(FileSystem* fs);
void TestDeleteFiles(FileSystem* fs);
void TestMoveFile(FileSystem* fs);
void TestMoveDir(FileSystem* fs);
void TestCopyFile(FileSystem* fs);
void TestCopyFiles(FileSystem* fs);
void TestGetFileInfo(FileSystem* fs);
void TestGetFileInfoVector(FileSystem* fs);
void TestGetFileInfoSelector(FileSystem* fs);
void TestGetFileInfoSelectorWithRecursion(FileSystem* fs);
void TestGetFileInfoAsync(FileSystem* fs);
void TestGetFileInfoGenerator(FileSystem* fs);
void TestOpenOutputStream(FileSystem* fs);
void TestOpenAppendStream(FileSystem* fs);
void TestOpenInputStream(FileSystem* fs);
void TestOpenInputStreamWithFileInfo(FileSystem* fs);
void TestOpenInputStreamAsync(FileSystem* fs);
void TestOpenInputFile(FileSystem* fs);
void TestOpenInputFileWithFileInfo(FileSystem* fs);
void TestOpenInputFileAsync(FileSystem* fs);
void TestSpecialChars(FileSystem* fs);
};
// Define one test case named NAME for TEST_CLASS using the test-definition macro
// TEST_MACRO (e.g. TEST_F or TYPED_TEST). The generated test body simply calls
// this->Test<NAME>() on the fixture.
#define GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, NAME) \
TEST_MACRO(TEST_CLASS, NAME) { this->Test##NAME(); }
// Expand GENERIC_FS_TEST_FUNCTION once for every generic filesystem test listed
// below, using TEST_MACRO to define each test on TEST_CLASS. Every NAME in the
// list must have a matching Test<NAME>() method on the fixture, as declared by
// GenericFileSystemTest.
#define GENERIC_FS_TEST_FUNCTIONS_MACROS(TEST_MACRO, TEST_CLASS) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, Empty) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, NormalizePath) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CreateDir) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteDir) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteDirContents) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteRootDirContents) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteFile) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteFiles) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveFile) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveDir) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CopyFile) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CopyFiles) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfo) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoVector) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoSelector) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoSelectorWithRecursion) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoAsync) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoGenerator) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenOutputStream) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenAppendStream) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStream) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStreamWithFileInfo) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStreamAsync) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFile) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileWithFileInfo) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileAsync) \
GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, SpecialChars)
// Define all generic filesystem tests on the fixture TEST_CLASS using TEST_F.
#define GENERIC_FS_TEST_FUNCTIONS(TEST_CLASS) \
GENERIC_FS_TEST_FUNCTIONS_MACROS(TEST_F, TEST_CLASS)
// Define all generic filesystem tests on the typed fixture TEST_CLASS using
// TYPED_TEST.
#define GENERIC_FS_TYPED_TEST_FUNCTIONS(TEST_CLASS) \
GENERIC_FS_TEST_FUNCTIONS_MACROS(TYPED_TEST, TEST_CLASS)
} // namespace fs
} // namespace arrow

View File

@@ -0,0 +1,53 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
namespace arrow {
namespace fs {
/// \brief Kind of entry that a filesystem path designates
///
/// The enumerator values are spelled out explicitly; they are the same values
/// the enumerators receive implicitly from their declaration order.
enum class FileType : int8_t {
  /// No entry was found
  NotFound = 0,
  /// An entry exists, but its type is not known
  ///
  /// This can designate a special file such as a Unix socket or character
  /// device, or Windows NUL / CON / ...
  Unknown = 1,
  /// The entry is a regular file
  File = 2,
  /// The entry is a directory
  Directory = 3,
};
// Forward declarations of filesystem types, letting other headers name these
// types (e.g. in pointers, references or declarations) without including
// their full definitions.
struct FileInfo;
struct FileSelector;
class FileSystem;
class AzureFileSystem;
class GcsFileSystem;
class LocalFileSystem;
class S3FileSystem;
class SlowFileSystem;
class SubTreeFileSystem;
} // namespace fs
} // namespace arrow