Initial commit

2025-12-09 12:13:01 +01:00
commit 8e654ed209
13332 changed files with 2695056 additions and 0 deletions
--- a/venv/lib/python3.10/site-packages/pyarrow/fs.py
+++ b/venv/lib/python3.10/site-packages/pyarrow/fs.py
@@ -0,0 +1,428 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+FileSystem abstraction to interact with various local and remote filesystems.
+"""
+
+from pyarrow.util import _is_path_like, _stringify_path
+
+from pyarrow._fs import (  # noqa
+    FileSelector,
+    FileType,
+    FileInfo,
+    FileSystem,
+    LocalFileSystem,
+    SubTreeFileSystem,
+    _MockFileSystem,
+    FileSystemHandler,
+    PyFileSystem,
+    _copy_files,
+    _copy_files_selector,
+)
+
+# For backward compatibility.
+FileStats = FileInfo
+
+# Names of optional filesystem classes whose extension module failed to
+# import. The module-level `__getattr__` consults this list to raise a
+# descriptive ImportError when one of these names is requested.
+_not_imported = []
+try:
+    from pyarrow._azurefs import AzureFileSystem  # noqa
+except ImportError:
+    _not_imported.append("AzureFileSystem")
+
+try:
+    from pyarrow._hdfs import HadoopFileSystem  # noqa
+except ImportError:
+    _not_imported.append("HadoopFileSystem")
+
+try:
+    from pyarrow._gcsfs import GcsFileSystem  # noqa
+except ImportError:
+    _not_imported.append("GcsFileSystem")
+
+try:
+    from pyarrow._s3fs import (  # noqa
+        AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy,
+        S3FileSystem, S3LogLevel, S3RetryStrategy, ensure_s3_initialized,
+        finalize_s3, ensure_s3_finalized, initialize_s3, resolve_s3_region)
+except ImportError:
+    # Only "S3FileSystem" is recorded here; the other S3 helper names
+    # imported above are not added to `_not_imported`.
+    _not_imported.append("S3FileSystem")
+else:
+    # GH-38364: we don't initialize S3 eagerly as that could lead
+    # to crashes at shutdown even when S3 isn't used.
+    # Instead, S3 is initialized lazily using `ensure_s3_initialized`
+    # in assorted places.
+    import atexit
+    # Runs only when the S3 import succeeded: finalize S3 at interpreter exit.
+    atexit.register(ensure_s3_finalized)
+
+
+def __getattr__(name):
+    """
+    Module-level attribute fallback for names not found in this module.
+
+    Raises
+    ------
+    ImportError
+        If `name` is listed in `_not_imported`, i.e. it is an optional
+        filesystem whose extension module could not be imported.
+    AttributeError
+        For any other name.
+    """
+    if name not in _not_imported:
+        raise AttributeError(
+            f"module 'pyarrow.fs' has no attribute '{name}'"
+        )
+
+    raise ImportError(
+        "The pyarrow installation is not built with support for "
+        f"'{name}'"
+    )
+
+
+def _ensure_filesystem(filesystem, *, use_mmap=False):
+    """
+    Coerce `filesystem` into a pyarrow FileSystem instance.
+
+    Accepted inputs are a FileSystem (returned unchanged), a URI string,
+    or an fsspec filesystem (only recognized when fsspec is importable).
+
+    Raises
+    ------
+    ValueError
+        If `use_mmap` is set together with a URI string, or if the path
+        component of the URI does not point to a directory.
+    TypeError
+        If `filesystem` is none of the accepted kinds.
+    """
+    if isinstance(filesystem, FileSystem):
+        return filesystem
+
+    if isinstance(filesystem, str):
+        if use_mmap:
+            raise ValueError(
+                "Specifying to use memory mapping not supported for "
+                "filesystem specified as an URI string"
+            )
+        # The `path` part of the URI, when present, acts as a prefix: the
+        # resulting filesystem is then wrapped in a SubTreeFileSystem.
+        uri_fs, uri_path = FileSystem.from_uri(filesystem)
+        prefix = uri_fs.normalize_path(uri_path)
+        if not prefix:
+            return uri_fs
+
+        # The prefix must refer to a directory.
+        prefix_info = uri_fs.get_file_info([prefix])[0]
+        if prefix_info.type != FileType.Directory:
+            raise ValueError(
+                "The path component of the filesystem URI must point to a "
+                f"directory but it has a type: `{prefix_info.type.name}`. The path "
+                f"component is `{prefix_info.path}` and the given filesystem URI "
+                f"is `{filesystem}`"
+            )
+        return SubTreeFileSystem(prefix, uri_fs)
+
+    # Remaining candidates: fsspec-compatible filesystems (optional dependency)
+    try:
+        import fsspec
+    except ImportError:
+        fsspec = None
+
+    if fsspec is not None and isinstance(filesystem, fsspec.AbstractFileSystem):
+        if type(filesystem).__name__ == 'LocalFileSystem':
+            # A plain fsspec LocalFileSystem maps onto the native arrow one
+            return LocalFileSystem(use_mmap=use_mmap)
+        return PyFileSystem(FSSpecHandler(filesystem))
+
+    raise TypeError(
+        f"Unrecognized filesystem: {type(filesystem)}. `filesystem` argument must "
+        "be a FileSystem instance or a valid file system URI"
+    )
+
+
+def _resolve_filesystem_and_path(path, filesystem=None, *, memory_map=False):
+    """
+    Return a ``(filesystem, path)`` pair for `path`.
+
+    `path` may be an URI, a plain filesystem path, or a combination of
+    fsspec protocol and URI. A `path` that is not path-like is returned
+    untouched, and passing `filesystem` alongside it is an error.
+    """
+    if not _is_path_like(path):
+        if filesystem is None:
+            return filesystem, path
+        raise ValueError(
+            "'filesystem' passed but the specified path is file-like, so"
+            " there is nothing to open with 'filesystem'."
+        )
+
+    if filesystem is not None:
+        # An explicit filesystem was given: coerce it and normalize the path
+        explicit_fs = _ensure_filesystem(filesystem, use_mmap=memory_map)
+        if isinstance(explicit_fs, LocalFileSystem):
+            path = _stringify_path(path)
+        elif not isinstance(path, str):
+            raise TypeError(
+                "Expected string path; path-like objects are only allowed "
+                "with a local filesystem"
+            )
+        return explicit_fs, explicit_fs.normalize_path(path)
+
+    path = _stringify_path(path)
+
+    # No filesystem given: first check whether the path exists as a local
+    # (relative) file path, and only then try to parse it as an URI.
+    local_fs = LocalFileSystem(use_mmap=memory_map)
+
+    try:
+        local_info = local_fs.get_file_info(path)
+    except ValueError:  # ValueError means path is likely an URI
+        found_locally = False
+    else:
+        found_locally = (local_info.type != FileType.NotFound)
+
+    if found_locally:
+        return local_fs, local_fs.normalize_path(path)
+
+    # The file or directory doesn't exist locally, so assume that the path
+    # is an URI describing the file system as well.
+    try:
+        uri_fs, uri_path = FileSystem.from_uri(path)
+    except ValueError as e:
+        msg = str(e)
+        if "empty scheme" not in msg and "Cannot parse URI" not in msg:
+            raise
+        # Neither an URI nor a locally existing path: keep the local
+        # filesystem and the path as-is, so that a nicer file not found
+        # error surfaces later instead of a confusing scheme parsing error.
+        return local_fs, path
+
+    return uri_fs, uri_path
+
+
+def copy_files(source, destination,
+               source_filesystem=None, destination_filesystem=None,
+               *, chunk_size=1024*1024, use_threads=True):
+    """
+    Copy files between FileSystems.
+
+    This function allows you to recursively copy directories of files from
+    one file system to another, such as from S3 to your local machine.
+
+    Parameters
+    ----------
+    source : string
+        Source file path or URI to a single file or directory.
+        If a directory, files will be copied recursively from this path.
+    destination : string
+        Destination file path or URI. If `source` is a file, `destination`
+        is also interpreted as the destination file (not directory).
+        Directories will be created as necessary.
+    source_filesystem : FileSystem, optional
+        Source filesystem, needs to be specified if `source` is not a URI,
+        otherwise inferred.
+    destination_filesystem : FileSystem, optional
+        Destination filesystem, needs to be specified if `destination` is not
+        a URI, otherwise inferred.
+    chunk_size : int, default 1MB
+        The maximum size of block to read before flushing to the
+        destination file. A larger chunk_size will use more memory while
+        copying but may help accommodate high latency FileSystems.
+    use_threads : bool, default True
+        Whether to use multiple threads to accelerate copying.
+
+    Examples
+    --------
+    Inspect an S3 bucket's files:
+
+    >>> s3, path = fs.FileSystem.from_uri(
+    ...            "s3://registry.opendata.aws/roda/ndjson/")
+    >>> selector = fs.FileSelector(path)
+    >>> s3.get_file_info(selector)
+    [<FileInfo for 'registry.opendata.aws/roda/ndjson/index.ndjson':...]
+
+    Copy one file from S3 bucket to a local directory:
+
+    >>> fs.copy_files("s3://registry.opendata.aws/roda/ndjson/index.ndjson",
+    ...               f"file:///{local_path}/index_copy.ndjson")
+
+    >>> fs.LocalFileSystem().get_file_info(str(local_path)+
+    ...                                    '/index_copy.ndjson')
+    <FileInfo for '.../index_copy.ndjson': type=FileType.File, size=...>
+
+    Copy file using a FileSystem object:
+
+    >>> fs.copy_files("registry.opendata.aws/roda/ndjson/index.ndjson",
+    ...               f"file:///{local_path}/index_copy.ndjson",
+    ...               source_filesystem=fs.S3FileSystem())
+    """
+    src_fs, src_path = _resolve_filesystem_and_path(
+        source, source_filesystem
+    )
+    dst_fs, dst_path = _resolve_filesystem_and_path(
+        destination, destination_filesystem
+    )
+
+    src_info = src_fs.get_file_info(src_path)
+    if src_info.type != FileType.Directory:
+        # Anything that is not a directory is copied as a single file
+        _copy_files(src_fs, src_path,
+                    dst_fs, dst_path,
+                    chunk_size, use_threads)
+        return
+
+    # Directory source: copy everything below it recursively
+    tree_selector = FileSelector(src_path, recursive=True)
+    _copy_files_selector(src_fs, tree_selector,
+                         dst_fs, dst_path,
+                         chunk_size, use_threads)
+
+
+class FSSpecHandler(FileSystemHandler):
+    """
+    Handler for fsspec-based Python filesystems.
+
+    https://filesystem-spec.readthedocs.io/en/latest/index.html
+
+    Parameters
+    ----------
+    fs : FSSpec-compliant filesystem instance
+
+    Examples
+    --------
+    >>> PyFileSystem(FSSpecHandler(fsspec_fs)) # doctest: +SKIP
+    """
+
+    def __init__(self, fs):
+        # The wrapped fsspec filesystem; every handler method delegates to it
+        self.fs = fs
+
+    def __eq__(self, other):
+        # Two handlers are equal when their wrapped filesystems are equal
+        if isinstance(other, FSSpecHandler):
+            return self.fs == other.fs
+        return NotImplemented
+
+    def __ne__(self, other):
+        if isinstance(other, FSSpecHandler):
+            return self.fs != other.fs
+        return NotImplemented
+
+    def get_type_name(self):
+        """
+        Return the type name, formatted as ``fsspec+<protocol>``.
+
+        When the wrapped filesystem declares several protocols, the first
+        one is used.
+        """
+        protocol = self.fs.protocol
+        # fsspec filesystems may declare `protocol` as a tuple (or list) of
+        # aliases rather than a single string; use the first alias.
+        if isinstance(protocol, (list, tuple)):
+            protocol = protocol[0]
+        return f"fsspec+{protocol}"
+
+    def normalize_path(self, path):
+        """Return `path` unchanged (no normalization is applied)."""
+        return path
+
+    @staticmethod
+    def _create_file_info(path, info):
+        """
+        Build a FileInfo for `path` from an fsspec `info` mapping.
+
+        Reads the "size" and "type" keys, and "mtime" when present.
+        """
+        size = info["size"]
+        if info["type"] == "file":
+            ftype = FileType.File
+        elif info["type"] == "directory":
+            ftype = FileType.Directory
+            # some fsspec filesystems include a file size for directories
+            size = None
+        else:
+            ftype = FileType.Unknown
+        return FileInfo(path, ftype, size=size, mtime=info.get("mtime", None))
+
+    def get_file_info(self, paths):
+        """
+        Return one FileInfo per entry of `paths`, in the same order.
+
+        A path for which the wrapped filesystem raises FileNotFoundError is
+        reported with type ``FileType.NotFound``.
+        """
+        infos = []
+        for path in paths:
+            try:
+                info = self.fs.info(path)
+            except FileNotFoundError:
+                infos.append(FileInfo(path, FileType.NotFound))
+            else:
+                infos.append(self._create_file_info(path, info))
+        return infos
+
+    def get_file_info_selector(self, selector):
+        """
+        Return FileInfo objects for the entries below ``selector.base_dir``.
+
+        Raises
+        ------
+        NotADirectoryError
+            If the base directory exists but is not a directory.
+        FileNotFoundError
+            If the base directory does not exist and
+            ``selector.allow_not_found`` is false.
+        """
+        if not self.fs.isdir(selector.base_dir):
+            if self.fs.exists(selector.base_dir):
+                raise NotADirectoryError(selector.base_dir)
+            else:
+                if selector.allow_not_found:
+                    return []
+                else:
+                    raise FileNotFoundError(selector.base_dir)
+
+        # A non-recursive selection only looks one level deep
+        if selector.recursive:
+            maxdepth = None
+        else:
+            maxdepth = 1
+
+        infos = []
+        selected_files = self.fs.find(
+            selector.base_dir, maxdepth=maxdepth, withdirs=True, detail=True
+        )
+        # Loop-invariant: base directory without leading/trailing slashes
+        base_dir = selector.base_dir.strip("/")
+        for path, info in selected_files.items():
+            _path = path.strip("/")
+            # Need to exclude base directory from selected files if present
+            # (fsspec filesystems, see GH-37555)
+            if _path != base_dir:
+                infos.append(self._create_file_info(path, info))
+
+        return infos
+
+    def create_dir(self, path, recursive):
+        """
+        Create the directory `path`; an already existing one is not an error.
+        """
+        # mkdir also raises FileNotFoundError when base directory is not found
+        try:
+            self.fs.mkdir(path, create_parents=recursive)
+        except FileExistsError:
+            pass
+
+    def delete_dir(self, path):
+        """Remove `path` recursively."""
+        self.fs.rm(path, recursive=True)
+
+    def _delete_dir_contents(self, path, missing_dir_ok):
+        """
+        Remove every entry listed under `path`, keeping `path` itself.
+
+        If listing `path` raises FileNotFoundError, the error is swallowed
+        when `missing_dir_ok` is true and re-raised otherwise.
+        """
+        try:
+            subpaths = self.fs.listdir(path, detail=False)
+        except FileNotFoundError:
+            if missing_dir_ok:
+                return
+            raise
+        for subpath in subpaths:
+            if self.fs.isdir(subpath):
+                self.fs.rm(subpath, recursive=True)
+            elif self.fs.isfile(subpath):
+                self.fs.rm(subpath)
+
+    def delete_dir_contents(self, path, missing_dir_ok):
+        """
+        Remove the contents of the directory `path`.
+
+        Raises
+        ------
+        ValueError
+            If `path` is empty or consists only of slashes (the root);
+            `delete_root_dir_contents` is the entry point for that case.
+        """
+        if path.strip("/") == "":
+            # Format a single message; passing the pieces as separate
+            # arguments would make the exception text render as a tuple.
+            raise ValueError(
+                f"delete_dir_contents called on path '{path}'")
+        self._delete_dir_contents(path, missing_dir_ok)
+
+    def delete_root_dir_contents(self):
+        """Remove the contents of the root directory ``"/"``."""
+        # `missing_dir_ok` is a required argument of the helper; a failure
+        # to list the root directory is propagated rather than ignored.
+        self._delete_dir_contents("/", missing_dir_ok=False)
+
+    def delete_file(self, path):
+        """
+        Remove the file `path`.
+
+        Raises
+        ------
+        FileNotFoundError
+            If `path` does not exist.
+        """
+        # fs.rm correctly raises IsADirectoryError when `path` is a directory
+        # instead of a file and `recursive` is not set to True
+        if not self.fs.exists(path):
+            raise FileNotFoundError(path)
+        self.fs.rm(path)
+
+    def move(self, src, dest):
+        """Move `src` to `dest` (recursively)."""
+        self.fs.mv(src, dest, recursive=True)
+
+    def copy_file(self, src, dest):
+        """Copy `src` to `dest`."""
+        # fs.copy correctly raises IsADirectoryError when `src` is a directory
+        # instead of a file
+        self.fs.copy(src, dest)
+
+    # TODO can we read/pass metadata (e.g. Content-Type) in the methods below?
+
+    def open_input_stream(self, path):
+        """
+        Open `path` for binary reading, wrapped in a pyarrow PythonFile.
+
+        Raises
+        ------
+        FileNotFoundError
+            If `path` is not a file according to the wrapped filesystem.
+        """
+        from pyarrow import PythonFile
+
+        if not self.fs.isfile(path):
+            raise FileNotFoundError(path)
+
+        return PythonFile(self.fs.open(path, mode="rb"), mode="r")
+
+    def open_input_file(self, path):
+        """
+        Open `path` for binary reading, wrapped in a pyarrow PythonFile.
+
+        Raises
+        ------
+        FileNotFoundError
+            If `path` is not a file according to the wrapped filesystem.
+        """
+        from pyarrow import PythonFile
+
+        if not self.fs.isfile(path):
+            raise FileNotFoundError(path)
+
+        return PythonFile(self.fs.open(path, mode="rb"), mode="r")
+
+    def open_output_stream(self, path, metadata):
+        """
+        Open `path` for binary writing, wrapped in a pyarrow PythonFile.
+
+        `metadata` is accepted but currently not used.
+        """
+        from pyarrow import PythonFile
+
+        return PythonFile(self.fs.open(path, mode="wb"), mode="w")
+
+    def open_append_stream(self, path, metadata):
+        """
+        Open `path` for binary appending, wrapped in a pyarrow PythonFile.
+
+        `metadata` is accepted but currently not used.
+        """
+        from pyarrow import PythonFile
+
+        return PythonFile(self.fs.open(path, mode="ab"), mode="w")