Videre
This commit is contained in:
@@ -0,0 +1,69 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
# pylint: disable=missing-docstring,import-outside-toplevel,import-self
|
||||
#
|
||||
# Import functions/classes to make the API
|
||||
from .core import Pooch, create, retrieve
|
||||
from .utils import os_cache, check_version, get_logger
|
||||
from .hashes import file_hash, make_registry
|
||||
from .downloaders import (
|
||||
HTTPDownloader,
|
||||
FTPDownloader,
|
||||
SFTPDownloader,
|
||||
DOIDownloader,
|
||||
)
|
||||
from .processors import Unzip, Untar, Decompress
|
||||
|
||||
# This file is generated automatically by setuptools_scm
|
||||
from . import _version # type: ignore
|
||||
|
||||
|
||||
# Add a "v" to the version number
|
||||
__version__ = f"v{_version.version}"
|
||||
|
||||
|
||||
def test(doctest=True, verbose=True, coverage=False):
    """
    Run the test suite.

    Uses `py.test <http://pytest.org/>`__ to discover and run the tests.

    Parameters
    ----------

    doctest : bool
        If ``True``, will run the doctests as well (code examples that start
        with a ``>>>`` in the docs).
    verbose : bool
        If ``True``, will print extra information during the test run.
    coverage : bool
        If ``True``, will run test coverage analysis on the code as well.
        Requires ``pytest-cov``.

    Raises
    ------

    AssertionError
        If pytest returns a non-zero error code indicating that some tests have
        failed.

    """
    # Imported here (not at module level) so that pytest is only a dependency
    # for running the test suite, not for using the package.
    import pytest

    package = __name__
    args = []
    if verbose:
        args.append("-vv")
    if coverage:
        args.append(f"--cov={package}")
        args.append("--cov-report=term-missing")
    if doctest:
        args.append("--doctest-modules")
    args.append("--pyargs")
    args.append(package)
    status = pytest.main(args)
    # Raise explicitly instead of using the ``assert`` statement: asserts are
    # stripped when Python runs with -O, which would silently hide failures.
    # AssertionError is kept to preserve the documented exception type.
    if status != 0:
        raise AssertionError("Some tests have failed.")
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,34 @@
|
||||
# file generated by setuptools-scm
# don't change, don't track in version control

# Public names re-exported when this module is star-imported.
__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# Poor-man's TYPE_CHECKING flag: avoids importing ``typing`` at runtime while
# still letting static type checkers see the precise aliases defined below.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    # At runtime the aliases only need to be valid annotation targets.
    VERSION_TUPLE = object
    COMMIT_ID = object

# Bare declarations (no values) so type checkers know the exported types.
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

# Concrete values written by setuptools-scm at build time.
__version__ = version = '1.9.0'
__version_tuple__ = version_tuple = (1, 9, 0)

# None when the build did not record a VCS commit id.
__commit_id__ = commit_id = None
|
||||
838
linedance-app/venv/lib/python3.12/site-packages/pooch/core.py
Normal file
838
linedance-app/venv/lib/python3.12/site-packages/pooch/core.py
Normal file
@@ -0,0 +1,838 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
"""
|
||||
The main Pooch class and a factory function for it.
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import contextlib
|
||||
from pathlib import Path
|
||||
import shlex
|
||||
import shutil
|
||||
from typing import Union, Optional, Any
|
||||
|
||||
|
||||
from .hashes import hash_matches, file_hash
|
||||
from .utils import (
|
||||
check_version,
|
||||
get_logger,
|
||||
make_local_storage,
|
||||
cache_location,
|
||||
temporary_file,
|
||||
os_cache,
|
||||
unique_file_name,
|
||||
)
|
||||
from .downloaders import DOIDownloader, choose_downloader, doi_to_repository
|
||||
from .typing import PathType, PathInputType, Processor, Downloader, Action
|
||||
|
||||
|
||||
def retrieve(
    url: str,
    known_hash: Optional[str] = None,
    fname: Optional[str] = None,
    path: Optional[PathType] = None,
    processor: Optional[Processor] = None,
    downloader: Optional[Downloader] = None,
    progressbar: bool = False,
) -> Any:
    # NOTE: annotated ``-> Any`` (was ``-> str``) because the return value is
    # whatever *processor* produces when one is given — e.g. the Unzip example
    # in this docstring returns a list of paths, not a str.
    """
    Download and cache a single file locally.

    Uses HTTP or FTP by default, depending on the protocol in the given *url*.
    Other download methods can be controlled through the *downloader* argument
    (see below).

    The file will be downloaded to a temporary location first and its hash will
    be compared to the given *known_hash*. This is done to ensure that the
    download happened correctly and securely. If the hash doesn't match, the
    file will be deleted and an exception will be raised.

    If the file already exists locally, its hash will be compared to
    *known_hash*. If they are not the same, this is interpreted as the file
    needing to be updated and it will be downloaded again.

    You can bypass these checks by passing ``known_hash=None``. If this is
    done, the SHA256 hash of the downloaded file will be logged to the screen.
    It is highly recommended that you copy and paste this hash as *known_hash*
    so that future downloads are guaranteed to be the exact same file. This is
    crucial for reproducible computations.

    If the file exists in the given *path* with the given *fname* and the hash
    matches, it will not be downloaded and the absolute path to the file will
    be returned.

    .. note::

        This function is meant for downloading single files. If you need to
        manage the download and caching of several files, with versioning, use
        :func:`pooch.create` and :class:`pooch.Pooch` instead.

    Parameters
    ----------
    url : str
        The URL to the file that is to be downloaded. Ideally, the URL should
        end in a file name.
    known_hash : str or None
        A known hash (checksum) of the file. Will be used to verify the
        download or check if an existing file needs to be updated. By default,
        will assume it's a SHA256 hash. To specify a different hashing method,
        prepend the hash with ``algorithm:``, for example
        ``md5:pw9co2iun29juoh`` or ``sha1:092odwhi2ujdp2du2od2odh2wod2``. If
        None, will NOT check the hash of the downloaded file or check if an
        existing file needs to be updated.
    fname : str or None
        The name that will be used to save the file. Should NOT include the
        full path, just the file name (it will be appended to *path*). If
        None, will create a unique file name using a combination of the last
        part of the URL (assuming it's the file name) and the MD5 hash of the
        URL. For example, ``81whdo2d2e928yd1wi22-data-file.csv``. This ensures
        that files from different URLs never overwrite each other, even if they
        have the same name.
    path : str or PathLike or None
        The location of the cache folder on disk. This is where the file will
        be saved. If None, will save to a ``pooch`` folder in the default cache
        location for your operating system (see :func:`pooch.os_cache`).
    processor : None or callable
        If not None, then a function (or callable object) that will be called
        before returning the full path and after the file has been downloaded
        (if required). See :ref:`processors` for details.
    downloader : None or callable
        If not None, then a function (or callable object) that will be called
        to download a given URL to a provided local file name. See
        :ref:`downloaders` for details.
    progressbar : bool or an arbitrary progress bar object
        If True, will print a progress bar of the download to standard error
        (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
        installed. Alternatively, an arbitrary progress bar object can be
        passed. See :ref:`custom-progressbar` for details.

    Returns
    -------
    full_path : str
        The absolute path (including the file name) of the file in the local
        storage. If a *processor* is used, its output is returned instead
        (for example, :class:`pooch.Unzip` returns a list of paths).

    Examples
    --------

    Download one of the data files from the Pooch repository on GitHub:

    >>> import os
    >>> from pooch import __version__, check_version, retrieve
    >>> # Make a URL for the version of pooch we have installed
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> # Download the file and save it locally. Will check the MD5 checksum of
    >>> # the downloaded file against the given value to make sure it's the
    >>> # right file. You can use other hashes by specifying different
    >>> # algorithm names (sha256, sha1, etc).
    >>> fname = retrieve(
    ...     url, known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e",
    ... )
    >>> with open(fname) as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> # Running again won't trigger a download and only return the path to
    >>> # the existing file.
    >>> fname2 = retrieve(
    ...     url, known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e",
    ... )
    >>> print(fname2 == fname)
    True
    >>> os.remove(fname)

    Files that are compressed with gzip, xz/lzma, or bzip2 can be automatically
    decompressed by passing using the :class:`pooch.Decompress` processor:

    >>> from pooch import Decompress
    >>> # URLs to a gzip compressed version of the data file.
    >>> url = ("https://github.com/fatiando/pooch/raw/{}/"
    ...        + "pooch/tests/data/tiny-data.txt.gz")
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> # By default, you would have to decompress the file yourself
    >>> fname = retrieve(
    ...     url,
    ...     known_hash="md5:8812ba10b6c7778014fdae81b03f9def",
    ... )
    >>> print(os.path.splitext(fname)[1])
    .gz
    >>> # Use the processor to decompress after download automatically and
    >>> # return the path to the decompressed file instead.
    >>> fname2 = retrieve(
    ...     url,
    ...     known_hash="md5:8812ba10b6c7778014fdae81b03f9def",
    ...     processor=Decompress(),
    ... )
    >>> print(fname2 == fname)
    False
    >>> with open(fname2) as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove(fname)
    >>> os.remove(fname2)

    When downloading archives (zip or tar), it can be useful to unpack them
    after download to avoid having to do that yourself. Use the processors
    :class:`pooch.Unzip` or :class:`pooch.Untar` to do this automatically:

    >>> from pooch import Unzip
    >>> # URLs to a zip archive with a single data file.
    >>> url = ("https://github.com/fatiando/pooch/raw/{}/"
    ...        + "pooch/tests/data/tiny-data.zip")
    >>> url = url.format(check_version(__version__, fallback="main"))
    >>> # By default, you would get the path to the archive
    >>> fname = retrieve(
    ...     url,
    ...     known_hash="md5:e9592cb46cf3514a1079051f8a148148",
    ... )
    >>> print(os.path.splitext(fname)[1])
    .zip
    >>> os.remove(fname)
    >>> # Using the processor, the archive will be unzipped and a list with the
    >>> # path to every file will be returned instead of a single path.
    >>> fnames = retrieve(
    ...     url,
    ...     known_hash="md5:e9592cb46cf3514a1079051f8a148148",
    ...     processor=Unzip(),
    ... )
    >>> # There was only a single file in our archive.
    >>> print(len(fnames))
    1
    >>> with open(fnames[0]) as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> for f in fnames:
    ...     os.remove(f)

    """
    if path is None:
        path = os_cache("pooch")
    if fname is None:
        fname = unique_file_name(url)
    # Make the path absolute.
    path = cache_location(path, env=None, version=None)

    full_path = path.resolve() / fname
    action, verb = download_action(full_path, known_hash)

    if action in ("download", "update"):
        # We need to write data, so create the local data directory if it
        # doesn't already exist.
        make_local_storage(path)

        get_logger().info(
            "%s data from '%s' to file '%s'.",
            verb,
            url,
            str(full_path),
        )

        if downloader is None:
            downloader = choose_downloader(url, progressbar=progressbar)

        stream_download(url, full_path, known_hash, downloader, pooch=None)

        if known_hash is None:
            get_logger().info(
                "SHA256 hash of downloaded file: %s\n"
                "Use this value as the 'known_hash' argument of 'pooch.retrieve'"
                " to ensure that the file hasn't changed if it is downloaded again"
                " in the future.",
                file_hash(str(full_path)),
            )

    if processor is not None:
        return processor(str(full_path), action, None)

    return str(full_path)
|
||||
|
||||
|
||||
def create(
    path: PathInputType,
    base_url: str,
    version: Optional[str] = None,
    version_dev: str = "master",
    env: Optional[str] = None,
    registry: Optional[dict] = None,
    urls: Optional[dict] = None,
    retry_if_failed: int = 0,
    allow_updates: Union[bool, str] = True,
) -> "Pooch":
    """
    Create a :class:`~pooch.Pooch` with sensible defaults to fetch data files.

    If a version string is given, the Pooch will be versioned, meaning that the
    local storage folder and the base URL depend on the project version. This
    is necessary if your users have multiple versions of your library installed
    (using virtual environments) and you updated the data files between
    versions. Otherwise, every time a user switches environments would trigger
    a re-download of the data. The version string will be appended to the local
    storage path (for example, ``~/.mypooch/cache/v0.1``) and inserted into the
    base URL (for example,
    ``https://github.com/fatiando/pooch/raw/v0.1/data``). If the version string
    contains ``+XX.XXXXX``, it will be interpreted as a development version.

    Does **not** create the local data storage folder. The folder will only be
    created the first time a download is attempted with
    :meth:`pooch.Pooch.fetch`. This makes it safe to use this function at the
    module level (so it's executed on ``import`` and the resulting
    :class:`~pooch.Pooch` is a global variable).

    Parameters
    ----------
    path : str, PathLike, list or tuple
        The path to the local data storage folder. If this is a list or tuple,
        we'll join the parts with the appropriate separator. The *version* will
        be appended to the end of this path. Use :func:`pooch.os_cache` for a
        sensible default.
    base_url : str
        Base URL for the remote data source. All requests will be made relative
        to this URL. The string should have a ``{version}`` formatting mark in
        it. We will call ``.format(version=version)`` on this string. If the
        URL does not end in a ``'/'``, a trailing ``'/'`` will be added
        automatically.
    version : str or None
        The version string for your project. Should be PEP440 compatible. If
        None is given, will not attempt to format *base_url* and no subfolder
        will be appended to *path*.
    version_dev : str
        The name used for the development version of a project. If your data is
        hosted on Github (and *base_url* is a Github raw link), then
        ``"master"`` is a good choice (default). Ignored if *version* is None.
    env : str or None
        An environment variable that can be used to overwrite *path*. This
        allows users to control where they want the data to be stored. We'll
        append *version* to the end of this value as well.
    registry : dict or None
        A record of the files that are managed by this Pooch. Keys should be
        the file names and the values should be their hashes. Only files
        in the registry can be fetched from the local storage. Files in
        subdirectories of *path* **must use Unix-style separators** (``'/'``)
        even on Windows.
    urls : dict or None
        Custom URLs for downloading individual files in the registry. A
        dictionary with the file names as keys and the custom URLs as values.
        Not all files in *registry* need an entry in *urls*. If a file has an
        entry in *urls*, the *base_url* will be ignored when downloading it in
        favor of ``urls[fname]``.
    retry_if_failed : int
        Retry a file download the specified number of times if it fails because
        of a bad connection or a hash mismatch. By default, downloads are only
        attempted once (``retry_if_failed=0``). Initially, will wait for 1s
        between retries and then increase the wait time by 1s with each retry
        until a maximum of 10s.
    allow_updates : bool or str
        Whether existing files in local storage that have a hash mismatch with
        the registry are allowed to update from the remote URL. If a string is
        passed, we will assume it's the name of an environment variable that
        will be checked for the true/false value. If ``False``, any mismatch
        with hashes in the registry will result in an error. Defaults to
        ``True``.

    Returns
    -------
    pooch : :class:`~pooch.Pooch`
        The :class:`~pooch.Pooch` initialized with the given arguments.

    Examples
    --------

    Create a :class:`~pooch.Pooch` for a release (v0.1):

    >>> pup = create(path="myproject",
    ...              base_url="http://some.link.com/{version}/",
    ...              version="v0.1",
    ...              registry={"data.txt": "9081wo2eb2gc0u..."})
    >>> print(pup.path.parts) # The path is a pathlib.Path
    ('myproject', 'v0.1')
    >>> # The local folder is only created when a dataset is first downloaded
    >>> print(pup.path.exists())
    False
    >>> print(pup.base_url)
    http://some.link.com/v0.1/
    >>> print(pup.registry)
    {'data.txt': '9081wo2eb2gc0u...'}
    >>> print(pup.registry_files)
    ['data.txt']

    If this is a development version (12 commits ahead of v0.1), then the
    ``version_dev`` will be used (defaults to ``"master"``):

    >>> pup = create(path="myproject",
    ...              base_url="http://some.link.com/{version}/",
    ...              version="v0.1+12.do9iwd")
    >>> print(pup.path.parts)
    ('myproject', 'master')
    >>> print(pup.base_url)
    http://some.link.com/master/

    Versioning is optional (but highly encouraged):

    >>> pup = create(path="myproject",
    ...              base_url="http://some.link.com/",
    ...              registry={"data.txt": "9081wo2eb2gc0u..."})
    >>> print(pup.path.parts) # The path is a pathlib.Path
    ('myproject',)
    >>> print(pup.base_url)
    http://some.link.com/

    To place the storage folder at a subdirectory, pass in a list and we'll
    join the path for you using the appropriate separator for your operating
    system:

    >>> pup = create(path=["myproject", "cache", "data"],
    ...              base_url="http://some.link.com/{version}/",
    ...              version="v0.1")
    >>> print(pup.path.parts)
    ('myproject', 'cache', 'data', 'v0.1')

    The user can overwrite the storage path by setting an environment variable:

    >>> # The variable is not set so we'll use *path*
    >>> pup = create(path=["myproject", "not_from_env"],
    ...              base_url="http://some.link.com/{version}/",
    ...              version="v0.1",
    ...              env="MYPROJECT_DATA_DIR")
    >>> print(pup.path.parts)
    ('myproject', 'not_from_env', 'v0.1')
    >>> # Set the environment variable and try again
    >>> import os
    >>> os.environ["MYPROJECT_DATA_DIR"] = os.path.join("myproject", "env")
    >>> pup = create(path=["myproject", "not_env"],
    ...              base_url="http://some.link.com/{version}/",
    ...              version="v0.1",
    ...              env="MYPROJECT_DATA_DIR")
    >>> print(pup.path.parts)
    ('myproject', 'env', 'v0.1')

    """
    if version is not None:
        # Normalize dev versions (anything with "+XX.XXXXX") to *version_dev*
        # and substitute the result into the URL template.
        version = check_version(version, fallback=version_dev)
        base_url = base_url.format(version=version)
    # Don't create the cache folder here! This function is usually called in
    # the module context (at import time), so touching the file system is not
    # recommended. It could cause crashes when multiple processes/threads try
    # to import at the same time (which would try to create the folder several
    # times at once).
    path = cache_location(path, env, version)
    if isinstance(allow_updates, str):
        # Treat the string as an environment variable name; only an explicit
        # "false" (case-insensitive) disables updates.
        allow_updates = os.environ.get(allow_updates, "true").lower() != "false"
    # add trailing "/"
    base_url = base_url.rstrip("/") + "/"
    pup = Pooch(
        path=path,
        base_url=base_url,
        registry=registry,
        urls=urls,
        retry_if_failed=retry_if_failed,
        allow_updates=allow_updates,
    )
    return pup
|
||||
|
||||
|
||||
class Pooch:
|
||||
"""
|
||||
Manager for a local data storage that can fetch from a remote source.
|
||||
|
||||
Avoid creating ``Pooch`` instances directly. Use :func:`pooch.create`
|
||||
instead.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
The path to the local data storage folder. The path must exist in the
|
||||
file system.
|
||||
base_url : str
|
||||
Base URL for the remote data source. All requests will be made relative
|
||||
to this URL.
|
||||
registry : dict or None
|
||||
A record of the files that are managed by this good boy. Keys should be
|
||||
the file names and the values should be their hashes. Only files
|
||||
in the registry can be fetched from the local storage. Files in
|
||||
subdirectories of *path* **must use Unix-style separators** (``'/'``)
|
||||
even on Windows.
|
||||
urls : dict or None
|
||||
Custom URLs for downloading individual files in the registry. A
|
||||
dictionary with the file names as keys and the custom URLs as values.
|
||||
Not all files in *registry* need an entry in *urls*. If a file has an
|
||||
entry in *urls*, the *base_url* will be ignored when downloading it in
|
||||
favor of ``urls[fname]``.
|
||||
retry_if_failed : int
|
||||
Retry a file download the specified number of times if it fails because
|
||||
of a bad connection or a hash mismatch. By default, downloads are only
|
||||
attempted once (``retry_if_failed=0``). Initially, will wait for 1s
|
||||
between retries and then increase the wait time by 1s with each retry
|
||||
until a maximum of 10s.
|
||||
allow_updates : bool
|
||||
Whether existing files in local storage that have a hash mismatch with
|
||||
the registry are allowed to update from the remote URL. If ``False``,
|
||||
any mismatch with hashes in the registry will result in an error.
|
||||
Defaults to ``True``.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path: PathType,
|
||||
base_url: str,
|
||||
registry: Optional[dict[str, str]] = None,
|
||||
urls: Optional[dict[str, str]] = None,
|
||||
retry_if_failed: int = 0,
|
||||
allow_updates: bool = True,
|
||||
) -> None:
|
||||
self.path = path
|
||||
self.base_url = base_url
|
||||
if registry is None:
|
||||
registry = {}
|
||||
self.registry = registry
|
||||
if urls is None:
|
||||
urls = {}
|
||||
self.urls = dict(urls)
|
||||
self.retry_if_failed = retry_if_failed
|
||||
self.allow_updates = allow_updates
|
||||
|
||||
@property
|
||||
def abspath(self) -> Path:
|
||||
"Absolute path to the local storage"
|
||||
return Path(os.path.abspath(os.path.expanduser(str(self.path))))
|
||||
|
||||
@property
|
||||
def registry_files(self) -> list[str]:
|
||||
"List of file names on the registry"
|
||||
return list(self.registry)
|
||||
|
||||
    def fetch(
        self,
        fname: str,
        processor: Optional[Processor] = None,
        downloader: Optional[Downloader] = None,
        progressbar: bool = False,
    ) -> str:
        # NOTE(review): when *processor* is given, the return value is
        # whatever the processor yields (e.g. Unzip returns a list of paths),
        # so the ``-> str`` annotation is only accurate without a processor.
        """
        Get the absolute path to a file in the local storage.

        If it's not in the local storage, it will be downloaded. If the hash of
        the file in local storage doesn't match the one in the registry, will
        download a new copy of the file. This is considered a sign that the
        file was updated in the remote storage. If the hash of the downloaded
        file still doesn't match the one in the registry, will raise an
        exception to warn of possible file corruption.

        Post-processing actions sometimes need to be taken on downloaded files
        (unzipping, conversion to a more efficient format, etc). If these
        actions are time or memory consuming, it would be best to do this only
        once right after the file is downloaded. Use the *processor* argument
        to specify a function that is executed after the download to perform
        these actions. See :ref:`processors` for details.

        Custom file downloaders can be provided through the *downloader*
        argument. By default, Pooch will determine the download protocol from
        the URL in the registry. If the server for a given file requires
        authentication (username and password), use a downloader that support
        these features. Downloaders can also be used to print custom messages
        (like a progress bar), etc. See :ref:`downloaders` for details.

        Parameters
        ----------
        fname : str
            The file name (relative to the *base_url* of the remote data
            storage) to fetch from the local storage.
        processor : None or callable
            If not None, then a function (or callable object) that will be
            called before returning the full path and after the file has been
            downloaded. See :ref:`processors` for details.
        downloader : None or callable
            If not None, then a function (or callable object) that will be
            called to download a given URL to a provided local file name. See
            :ref:`downloaders` for details.
        progressbar : bool or an arbitrary progress bar object
            If True, will print a progress bar of the download to standard
            error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
            be installed. Alternatively, an arbitrary progress bar object can
            be passed. See :ref:`custom-progressbar` for details.

        Returns
        -------
        full_path : str
            The absolute path (including the file name) of the file in the
            local storage.

        """
        # Unknown files are a hard error (ValueError) before any I/O happens.
        self._assert_file_in_registry(fname)

        url = self.get_url(fname)
        full_path = self.abspath / fname
        known_hash = self.registry[fname]
        # Decide what needs to happen: "download" (no local copy), "update"
        # (local copy hash-mismatches the registry), or neither (use as-is).
        action, verb = download_action(full_path, known_hash)

        if action == "update" and not self.allow_updates:
            raise ValueError(
                f"{fname} needs to update {full_path} but updates are disallowed."
            )

        if action in ("download", "update"):
            # We need to write data, so create the local data directory if it
            # doesn't already exist.
            make_local_storage(str(self.abspath))

            get_logger().info(
                "%s file '%s' from '%s' to '%s'.",
                verb,
                fname,
                url,
                str(self.abspath),
            )

            if downloader is None:
                # Pick HTTP/FTP/etc. based on the URL's protocol.
                downloader = choose_downloader(url, progressbar=progressbar)

            # Downloads to a temporary file and verifies the hash before
            # moving into place; retried on failure per retry_if_failed.
            stream_download(
                url,
                full_path,
                known_hash,
                downloader,
                pooch=self,
                retry_if_failed=self.retry_if_failed,
            )

        if processor is not None:
            # The processor sees the path, the action taken, and this Pooch.
            return processor(str(full_path), action, self)

        return str(full_path)
|
||||
|
||||
def _assert_file_in_registry(self, fname: str) -> None:
|
||||
"""
|
||||
Check if a file is in the registry and raise :class:`ValueError` if
|
||||
it's not.
|
||||
"""
|
||||
if fname not in self.registry:
|
||||
raise ValueError(f"File '{fname}' is not in the registry.")
|
||||
|
||||
def get_url(self, fname: str) -> str:
|
||||
"""
|
||||
Get the full URL to download a file in the registry.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fname : str
|
||||
The file name (relative to the *base_url* of the remote data
|
||||
storage) to fetch from the local storage.
|
||||
|
||||
"""
|
||||
self._assert_file_in_registry(fname)
|
||||
return self.urls.get(fname, "".join([self.base_url, fname]))
|
||||
|
||||
def load_registry(self, fname: PathType) -> None:
|
||||
"""
|
||||
Load entries from a file and add them to the registry.
|
||||
|
||||
Use this if you are managing many files.
|
||||
|
||||
Each line of the file should have file name and its hash separated by
|
||||
a space. Hash can specify checksum algorithm using "alg:hash" format.
|
||||
In case no algorithm is provided, SHA256 is used by default.
|
||||
Only one file per line is allowed. Custom download URLs for individual
|
||||
files can be specified as a third element on the line. Line comments
|
||||
can be added and must be prepended with ``#``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
fname : str | fileobj
|
||||
Path (or open file object) to the registry file.
|
||||
|
||||
"""
|
||||
with contextlib.ExitStack() as stack:
|
||||
if hasattr(fname, "read"):
|
||||
# It's a file object
|
||||
fin: Any = fname
|
||||
else:
|
||||
# It's a file path
|
||||
fin = stack.enter_context(open(fname, encoding="utf-8"))
|
||||
|
||||
for linenum, line in enumerate(fin):
|
||||
if isinstance(line, bytes):
|
||||
line = line.decode("utf-8")
|
||||
|
||||
line = line.strip()
|
||||
# skip line comments
|
||||
if line.startswith("#"):
|
||||
continue
|
||||
|
||||
elements = shlex.split(line)
|
||||
if not len(elements) in [0, 2, 3]:
|
||||
raise OSError(
|
||||
f"Invalid entry in Pooch registry file '{fname}': "
|
||||
f"expected 2 or 3 elements in line {linenum + 1} but got "
|
||||
f"{len(elements)}. Offending entry: '{line}'"
|
||||
)
|
||||
if elements:
|
||||
file_name = elements[0]
|
||||
file_checksum = elements[1]
|
||||
if len(elements) == 3:
|
||||
file_url = elements[2]
|
||||
self.urls[file_name] = file_url
|
||||
self.registry[file_name] = file_checksum.lower()
|
||||
|
||||
def load_registry_from_doi(self) -> None:
    """
    Populate the registry using the data repository API

    Fill the registry with all the files available in the data repository,
    along with their hashes. It will make a request to the data repository
    API to retrieve this information. No file is downloaded during this
    process.

    .. important::

        This method is intended to be used only when the ``base_url`` is
        a DOI.

    Raises
    ------
    ValueError
        If ``base_url`` is not a DOI (i.e. ``choose_downloader`` does not
        select a :class:`DOIDownloader` for it).
    """

    # Ensure that this is indeed a DOI-based pooch
    downloader = choose_downloader(self.base_url)
    if not isinstance(downloader, DOIDownloader):
        raise ValueError(
            f"Invalid base_url '{self.base_url}': "
            + "Pooch.load_registry_from_doi is only implemented for DOIs"
        )

    # Create a repository instance. Strip the "doi:" protocol prefix so
    # only the bare DOI is passed on; connection options configured on the
    # downloader are forwarded to the repository.
    doi = self.base_url.replace("doi:", "")
    repository = doi_to_repository(
        doi,
        headers=downloader.headers,
        timeout=downloader.timeout,
        **downloader.kwargs,
    )

    # Call registry population for this repository.
    # NOTE(review): declared ``-> None`` but returns the result of
    # populate_registry — presumably that helper returns None; confirm.
    return repository.populate_registry(self)
|
||||
|
||||
def is_available(self, fname: str, downloader: Optional[Downloader] = None):
    """
    Check availability of a remote file without downloading it.

    Use this method when working with large files to check if they are
    available for download.

    Parameters
    ----------
    fname : str
        The file name (relative to the *base_url* of the remote data
        storage).
    downloader : None or callable
        If not None, then a function (or callable object) that will be
        called to check the availability of the file on the server. See
        :ref:`downloaders` for details.

    Returns
    -------
    status : bool
        True if the file is available for download. False otherwise.

    Raises
    ------
    NotImplementedError
        If the downloader does not support availability checks (its call
        raises ``TypeError`` on the ``check_only`` keyword).

    """
    self._assert_file_in_registry(fname)
    url = self.get_url(fname)
    if downloader is None:
        # Pick a downloader based on the URL's protocol.
        downloader = choose_downloader(url)
    try:
        # check_only=True asks the downloader to probe the server instead
        # of fetching; downloaders without that keyword raise TypeError.
        available = downloader(url, None, self, check_only=True)
    except TypeError as error:
        error_msg = (
            f"Downloader '{str(downloader)}' does not support availability checks."
        )
        raise NotImplementedError(error_msg) from error
    return available
|
||||
|
||||
|
||||
def download_action(path: Path, known_hash: Optional[str]) -> tuple[Action, str]:
    """
    Determine the action that is needed to get the file on disk.

    Parameters
    ----------
    path : PathLike
        The path to the file on disk.
    known_hash : str
        A known hash (checksum) of the file. Will be used to verify the
        download or check if an existing file needs to be updated. By default,
        will assume it's a SHA256 hash. To specify a different hashing method,
        prepend the hash with ``algorithm:``, for example
        ``md5:pw9co2iun29juoh`` or ``sha1:092odwhi2ujdp2du2od2odh2wod2``.

    Returns
    -------
    action, verb : str
        The action that must be taken and the English verb (infinitive form
        of *action*) used in the log:

        * ``'download'``: File does not exist locally and must be downloaded.
        * ``'update'``: File exists locally but needs to be updated.
        * ``'fetch'``: File exists locally and only need to inform its path.

    """
    # No local copy at all: it must be downloaded.
    if not path.exists():
        return "download", "Downloading"
    # A local copy exists; keep it only if its checksum still matches.
    if hash_matches(str(path), known_hash):
        return "fetch", "Fetching"
    return "update", "Updating"
|
||||
|
||||
|
||||
def stream_download(
    url: str,
    fname: Path,
    known_hash: Optional[str],
    downloader: Downloader,
    pooch: Optional[Pooch] = None,
    retry_if_failed: int = 0,
) -> None:
    """
    Stream the file and check that its hash matches the known one.

    The file is first downloaded to a temporary file name in the cache folder.
    It will be moved to the desired file name only if the hash matches the
    known hash. Otherwise, the temporary file is deleted.

    If the download fails for either a bad connection or a hash mismatch, we
    will retry the download the specified number of times in case the failure
    was due to a network error.

    Parameters
    ----------
    url : str
        URL to download the file from.
    fname : Path
        Destination path for the downloaded file.
    known_hash : str or None
        Expected hash of the file; checked strictly after each download.
    downloader : callable
        Called as ``downloader(url, destination, pooch)`` to do the
        actual transfer.
    pooch : Pooch or None
        Passed through to the downloader.
    retry_if_failed : int
        Number of *additional* attempts after the first failure.
    """
    # Lazy import requests to speed up import time
    import requests.exceptions  # pylint: disable=C0415

    # Ensure the parent directory exists in case the file is in a subdirectory.
    # Otherwise, move will cause an error.
    if not fname.parent.exists():
        os.makedirs(str(fname.parent))
    download_attempts = 1 + retry_if_failed
    # Cap the backoff sleep (seconds) between retries.
    max_wait = 10
    for i in range(download_attempts):
        try:
            # Stream the file to a temporary so that we can safely check its
            # hash before overwriting the original.
            with temporary_file(path=str(fname.parent)) as tmp:
                downloader(url, tmp, pooch)
                # strict=True raises ValueError on mismatch, which the
                # except clause below treats as a retryable failure.
                hash_matches(tmp, known_hash, strict=True, source=str(fname.name))
                shutil.move(tmp, str(fname))
            break
        except (ValueError, requests.exceptions.RequestException):
            # Out of attempts: propagate the last failure to the caller.
            if i == download_attempts - 1:
                raise
            retries_left = download_attempts - (i + 1)
            get_logger().info(
                "Failed to download '%s'. "
                "Will attempt the download again %d more time%s.",
                str(fname.name),
                retries_left,
                "s" if retries_left > 1 else "",
            )
            # Linear backoff, bounded by max_wait.
            time.sleep(min(i + 1, max_wait))
|
||||
1219
linedance-app/venv/lib/python3.12/site-packages/pooch/downloaders.py
Normal file
1219
linedance-app/venv/lib/python3.12/site-packages/pooch/downloaders.py
Normal file
File diff suppressed because it is too large
Load Diff
228
linedance-app/venv/lib/python3.12/site-packages/pooch/hashes.py
Normal file
228
linedance-app/venv/lib/python3.12/site-packages/pooch/hashes.py
Normal file
@@ -0,0 +1,228 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
"""
|
||||
Calculating and checking file hashes.
|
||||
"""
|
||||
import hashlib
|
||||
import functools
|
||||
from pathlib import Path
|
||||
|
||||
# Mapping of algorithm name -> hasher constructor for every algorithm this
# build of Python supports.
# From the docs: https://docs.python.org/3/library/hashlib.html#hashlib.new
#   The named constructors are much faster than new() and should be
#   preferred.
# Need to fallback on new() for some algorithms.
ALGORITHMS_AVAILABLE = {
    alg: getattr(hashlib, alg, functools.partial(hashlib.new, alg))
    for alg in hashlib.algorithms_available
}

try:
    # Optional dependency: extend the table with xxhash algorithms when the
    # package is installed.
    import xxhash

    # xxhash doesn't have a list of available algorithms yet.
    # https://github.com/ifduyue/python-xxhash/issues/48
    ALGORITHMS_AVAILABLE.update(
        {
            alg: getattr(xxhash, alg, None)
            for alg in ["xxh128", "xxh64", "xxh32", "xxh3_128", "xxh3_64"]
        }
    )
    # The xxh3 algorithms are only available for version>=2.0. Set to None and
    # remove to ensure backwards compatibility.
    ALGORITHMS_AVAILABLE = {
        alg: func for alg, func in ALGORITHMS_AVAILABLE.items() if func is not None
    }
except ImportError:
    pass
|
||||
|
||||
|
||||
def file_hash(fname, alg="sha256"):
    """
    Calculate the hash of a given file.

    Useful for checking if a file has changed or been corrupted.

    Parameters
    ----------
    fname : str
        The name of the file.
    alg : str
        The type of the hashing algorithm

    Returns
    -------
    hash : str
        The hash of the file.

    Raises
    ------
    ValueError
        If *alg* is not one of the available algorithms.

    Examples
    --------

    >>> fname = "test-file-for-hash.txt"
    >>> with open(fname, "w") as f:
    ...     __ = f.write("content of the file")
    >>> print(file_hash(fname))
    0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
    >>> import os
    >>> os.remove(fname)

    """
    if alg not in ALGORITHMS_AVAILABLE:
        raise ValueError(
            f"Algorithm '{alg}' not available to the pooch library. "
            "Only the following algorithms are available "
            f"{list(ALGORITHMS_AVAILABLE.keys())}."
        )
    # hashlib constructors accept usedforsecurity=False (needed so checksums
    # work on FIPS-enabled systems); xxhash constructors do not take it.
    if alg in hashlib.algorithms_available:
        hasher = ALGORITHMS_AVAILABLE[alg](usedforsecurity=False)
    else:
        hasher = ALGORITHMS_AVAILABLE[alg]()
    # Feed the file to the hasher in chunks to keep memory use bounded.
    blocksize = 65536
    with open(fname, "rb") as stream:
        while chunk := stream.read(blocksize):
            hasher.update(chunk)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def hash_algorithm(hash_string):
    """
    Parse the name of the hash method from the hash string.

    The hash string should have the form ``algorithm:hash``, where algorithm
    can be the name of any algorithm known to :mod:`hashlib`.

    If the algorithm is omitted or the hash string is None, will default to
    ``"sha256"``.

    Parameters
    ----------
    hash_string : str
        The hash string with optional algorithm prepended.

    Returns
    -------
    hash_algorithm : str
        The name of the algorithm (always lowercase).

    Examples
    --------

    >>> print(hash_algorithm("qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("md5:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    md5
    >>> print(hash_algorithm("SHA256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    sha256
    >>> print(hash_algorithm("xxh3_64:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
    xxh3_64
    >>> print(hash_algorithm(None))
    sha256

    """
    # No string or no "alg:" prefix means the default algorithm.
    if hash_string is None or ":" not in hash_string:
        return "sha256"
    # Normalize to lowercase so registry entries are case-insensitive.
    return hash_string.split(":")[0].lower()
|
||||
|
||||
|
||||
def hash_matches(fname, known_hash, strict=False, source=None):
    """
    Check if the hash of a file matches a known hash.

    If the *known_hash* is None, will always return True.

    Converts hashes to lowercase before comparison to avoid system specific
    mismatches between hashes in the registry and computed hashes.

    Parameters
    ----------
    fname : str or PathLike
        The path to the file.
    known_hash : str
        The known hash. Optionally, prepend ``alg:`` to the hash to specify
        the hashing algorithm. Default is SHA256.
    strict : bool
        If True, will raise a :class:`ValueError` if the hash does not match
        informing the user that the file may be corrupted.
    source : str
        The source of the downloaded file (name or URL, for example). Will be
        used in the error message if *strict* is True. Has no other use other
        than reporting to the user where the file came from in case of hash
        mismatch. If None, will default to *fname*.

    Returns
    -------
    is_same : bool
        True if the hash matches, False otherwise.

    """
    # A missing known hash means "don't check".
    if known_hash is None:
        return True
    algorithm = hash_algorithm(known_hash)
    # Strip the optional "alg:" prefix and compare case-insensitively.
    expected = known_hash.split(":")[-1].lower()
    actual = file_hash(fname, alg=algorithm)
    if actual.lower() == expected:
        return True
    if strict:
        label = str(fname) if source is None else source
        raise ValueError(
            f"{algorithm.upper()} hash of downloaded file ({label}) does not match"
            f" the known hash: expected {known_hash} but got {actual}. Deleted"
            " download for safety. The downloaded file may have been corrupted or"
            " the known hash may be outdated."
        )
    return False
|
||||
|
||||
|
||||
def make_registry(directory, output, recursive=True):
    """
    Make a registry of files and hashes for the given directory.

    This is helpful if you have many files in your test dataset as it keeps
    you from needing to manually update the registry.

    Parameters
    ----------
    directory : str
        Directory of the test data to put in the registry. All file names in
        the registry will be relative to this directory.
    output : str
        Name of the output registry file.
    recursive : bool
        If True, will recursively look for files in subdirectories of
        *directory*.

    """
    base = Path(directory)
    pattern = "**/*" if recursive else "*"

    # Sort so the registry is deterministic across runs and platforms.
    relative_names = sorted(
        str(found.relative_to(base))
        for found in base.glob(pattern)
        if found.is_file()
    )

    with open(output, "w", encoding="utf-8") as outfile:
        for fname in relative_names:
            fhash = file_hash(str(base / fname))
            # Only use Unix separators for the registry so that we don't go
            # insane dealing with file paths.
            outfile.write("{} {}\n".format(fname.replace("\\", "/"), fhash))
|
||||
@@ -0,0 +1,415 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
# pylint: disable=line-too-long
|
||||
"""
|
||||
Post-processing hooks
|
||||
"""
|
||||
|
||||
import abc
|
||||
import os
|
||||
import bz2
|
||||
import gzip
|
||||
import lzma
|
||||
import shutil
|
||||
import sys
|
||||
from zipfile import ZipFile
|
||||
from tarfile import TarFile
|
||||
|
||||
from .utils import get_logger
|
||||
|
||||
|
||||
class ExtractorProcessor(abc.ABC):  # pylint: disable=too-few-public-methods
    """
    Abstract base class for extractions from compressed archives.

    Subclasses can be used with :meth:`pooch.Pooch.fetch` and
    :func:`pooch.retrieve` to unzip a downloaded data file into a folder in the
    local data store. :meth:`~pooch.Pooch.fetch` will return a list with the
    names of the extracted files instead of the archive.

    Parameters
    ----------
    members : list or None
        If None, will unpack all files in the archive. Otherwise, *members*
        must be a list of file names to unpack from the archive. Only these
        files will be unpacked.
    extract_dir : str or None
        If None, files will be unpacked to the default location (a folder in
        the same location as the downloaded zip file, with a suffix added).
        Otherwise, files will be unpacked to ``extract_dir``, which is
        interpreted as a *relative path* (relative to the cache location
        provided by :func:`pooch.retrieve` or :meth:`pooch.Pooch.fetch`).

    """

    def __init__(self, members=None, extract_dir=None):
        # Optional subset of archive members to unpack (None = all).
        self.members = members
        # Destination folder; resolved to an absolute path in __call__.
        self.extract_dir = extract_dir

    @property
    @abc.abstractmethod
    def suffix(self):
        """
        String appended to unpacked archive folder name.
        Only used if extract_dir is None.
        MUST BE IMPLEMENTED BY CHILD CLASSES.
        """

    @abc.abstractmethod
    def _all_members(self, fname):
        """
        Return all the members in the archive.
        MUST BE IMPLEMENTED BY CHILD CLASSES.
        """

    @abc.abstractmethod
    def _extract_file(self, fname, extract_dir):
        """
        This method receives an argument for the archive to extract and the
        destination path.
        MUST BE IMPLEMENTED BY CHILD CLASSES.
        """

    def __call__(self, fname, action, pooch):
        """
        Extract all files from the given archive.

        Parameters
        ----------
        fname : str
            Full path of the zipped file in local storage.
        action : str
            Indicates what action was taken by :meth:`pooch.Pooch.fetch` or
            :func:`pooch.retrieve`:

            * ``"download"``: File didn't exist locally and was downloaded
            * ``"update"``: Local file was outdated and was re-download
            * ``"fetch"``: File exists and is updated so it wasn't downloaded

        pooch : :class:`pooch.Pooch`
            The instance of :class:`pooch.Pooch` that is calling this.

        Returns
        -------
        fnames : list of str
            A list of the full path to all files in the extracted archive.

        """
        # NOTE(review): this overwrites self.extract_dir in place, so calling
        # the same processor instance again re-resolves from the *already
        # resolved* path — confirm processors are meant to be single-use or
        # single-archive.
        if self.extract_dir is None:
            self.extract_dir = fname + self.suffix
        else:
            archive_dir = fname.rsplit(os.path.sep, maxsplit=1)[0]
            self.extract_dir = os.path.join(archive_dir, self.extract_dir)
        # Get a list of everyone who is supposed to be in the unpacked folder
        # so we can check if they are all there or if we need to extract new
        # files.
        if self.members is None or not self.members:
            members = self._all_members(fname)
        else:
            members = self.members
        # Re-extract when the archive was (re)downloaded or when any expected
        # member is missing from the destination folder.
        if (
            (action in ("update", "download"))
            or (not os.path.exists(self.extract_dir))
            or not all(
                os.path.exists(os.path.join(self.extract_dir, m)) for m in members
            )
        ):
            # Make sure that the folder with the extracted files exists
            os.makedirs(self.extract_dir, exist_ok=True)
            self._extract_file(fname, self.extract_dir)

        # Get a list of all file names (including subdirectories) in our folder
        # of unzipped files, filtered by the given members list
        fnames = []
        for path, _, files in os.walk(self.extract_dir):
            for filename in files:
                # Path of this file relative to the extraction folder, used
                # to match against the requested members.
                relpath = os.path.normpath(
                    os.path.join(os.path.relpath(path, self.extract_dir), filename)
                )
                if self.members is None or any(
                    relpath.startswith(os.path.normpath(m)) for m in self.members
                ):
                    fnames.append(os.path.join(path, filename))

        return fnames
|
||||
|
||||
|
||||
class Unzip(ExtractorProcessor):  # pylint: disable=too-few-public-methods
    """
    Processor that unpacks a zip archive and returns a list of all files.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to unzip a
    downloaded data file into a folder in the local data store. The
    method/function will return a list with the names of the unzipped files
    instead of the zip archive.

    The output folder is ``{fname}.unzip``.

    Parameters
    ----------
    members : list or None
        If None, will unpack all files in the zip archive. Otherwise, *members*
        must be a list of file names to unpack from the archive. Only these
        files will be unpacked.
    extract_dir : str or None
        If None, files will be unpacked to the default location (a folder in
        the same location as the downloaded zip file, with the suffix
        ``.unzip`` added). Otherwise, files will be unpacked to
        ``extract_dir``, which is interpreted as a *relative path* (relative to
        the cache location provided by :func:`pooch.retrieve` or
        :meth:`pooch.Pooch.fetch`).

    """

    @property
    def suffix(self):
        """
        String appended to unpacked archive folder name.
        Only used if extract_dir is None.
        """
        return ".unzip"

    def _all_members(self, fname):
        """Return all members from a given archive."""
        with ZipFile(fname, "r") as zip_file:
            return zip_file.namelist()

    def _extract_file(self, fname, extract_dir):
        """
        Extract the archive *fname* (or the configured subset of its members)
        into *extract_dir*.
        """
        with ZipFile(fname, "r") as zip_file:
            if self.members is None:
                get_logger().info(
                    "Unzipping contents of '%s' to '%s'", fname, extract_dir
                )
                # Unpack all files from the archive into our new folder
                zip_file.extractall(path=extract_dir)
            else:
                for member in self.members:
                    get_logger().info(
                        "Extracting '%s' from '%s' to '%s'", member, fname, extract_dir
                    )
                    # If the member is a dir, we need to get the names of the
                    # elements it contains for extraction (ZipFile does not
                    # support dirs on .extract). If it's not a dir, this will
                    # only include the member itself.
                    # Based on:
                    # https://stackoverflow.com/questions/8008829/extract-only-a-single-directory-from-tar
                    subdir_members = [
                        name
                        for name in zip_file.namelist()
                        if os.path.normpath(name).startswith(os.path.normpath(member))
                    ]
                    # Extract the data file from within the archive
                    zip_file.extractall(members=subdir_members, path=extract_dir)
|
||||
|
||||
|
||||
class Untar(ExtractorProcessor):  # pylint: disable=too-few-public-methods
    """
    Processor that unpacks a tar archive and returns a list of all files.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to untar a
    downloaded data file into a folder in the local data store. The
    method/function will return a list with the names of the extracted files
    instead of the archive.

    The output folder is ``{fname}.untar``.


    Parameters
    ----------
    members : list or None
        If None, will unpack all files in the archive. Otherwise, *members*
        must be a list of file names to unpack from the archive. Only these
        files will be unpacked.
    extract_dir : str or None
        If None, files will be unpacked to the default location (a folder in
        the same location as the downloaded tar file, with the suffix
        ``.untar`` added). Otherwise, files will be unpacked to
        ``extract_dir``, which is interpreted as a *relative path* (relative to
        the cache location provided by :func:`pooch.retrieve` or
        :meth:`pooch.Pooch.fetch`).
    """

    @property
    def suffix(self):
        """
        String appended to unpacked archive folder name.
        Only used if extract_dir is None.
        """
        return ".untar"

    def _all_members(self, fname):
        """Return all members from a given archive."""
        with TarFile.open(fname, "r") as tar_file:
            return [info.name for info in tar_file.getmembers()]

    def _extract_file(self, fname, extract_dir):
        """
        Extract the archive *fname* (or the configured subset of its members)
        into *extract_dir*.
        """
        # Python >= 3.12 supports tarfile extraction filters; the "data"
        # filter rejects unsafe members. The keyword doesn't exist on older
        # versions, so pass nothing there.
        filter_kwarg = {} if sys.version_info < (3, 12) else {"filter": "data"}
        with TarFile.open(fname, "r") as tar_file:
            if self.members is None:
                get_logger().info(
                    "Untarring contents of '%s' to '%s'", fname, extract_dir
                )
                # Unpack all files from the archive into our new folder
                tar_file.extractall(path=extract_dir, **filter_kwarg)
            else:
                for member in self.members:
                    get_logger().info(
                        "Extracting '%s' from '%s' to '%s'", member, fname, extract_dir
                    )
                    # If the member is a dir, we need to get the names of the
                    # elements it contains for extraction (TarFile does not
                    # support dirs on .extract). If it's not a dir, this will
                    # only include the member itself.
                    # Based on:
                    # https://stackoverflow.com/questions/8008829/extract-only-a-single-directory-from-tar
                    # Can't use .getnames because extractall expects TarInfo
                    # objects.
                    subdir_members = [
                        info
                        for info in tar_file.getmembers()
                        if os.path.normpath(info.name).startswith(
                            os.path.normpath(member)
                        )
                    ]
                    # Extract the data file from within the archive
                    tar_file.extractall(
                        members=subdir_members, path=extract_dir, **filter_kwarg
                    )
|
||||
|
||||
|
||||
class Decompress:  # pylint: disable=too-few-public-methods
    """
    Processor that decompress a file and returns the decompressed version.

    Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to decompress
    a downloaded data file so that it can be easily opened. Useful for data
    files that take a long time to decompress (exchanging disk space for
    speed).

    Supported decompression methods are LZMA (``.xz``), bzip2 (``.bz2``), and
    gzip (``.gz``).

    File names with the standard extensions (see above) can use
    ``method="auto"`` to automatically determine the compression method. This
    can be overwritten by setting the *method* argument.

    .. note::

        To unpack zip and tar archives with one or more files, use
        :class:`pooch.Unzip` and :class:`pooch.Untar` instead.

    The output file is ``{fname}.decomp`` by default but it can be changed by
    setting the ``name`` parameter.

    .. warning::

        Passing in ``name`` can cause existing data to be lost! For example, if
        a file already exists with the specified name it will be overwritten
        with the new decompressed file content. **Use this option with
        caution.**

    Parameters
    ----------
    method : str
        Name of the compression method. Can be "auto", "lzma", "xz", "bzip2",
        or "gzip".
    name : None or str
        Defines the decompressed file name. The file name will be
        ``{fname}.decomp`` if ``None`` (default) or the given name otherwise.
        Note that the name should **not** include the full (or relative) path,
        it should be just the file name itself.

    """

    # Map method name -> stdlib module providing an ``open`` function.
    # "auto" is resolved from the file extension in _compression_module.
    modules = {"auto": None, "lzma": lzma, "xz": lzma, "gzip": gzip, "bzip2": bz2}
    # Map recognized file extension -> method name (for method="auto").
    extensions = {".xz": "lzma", ".gz": "gzip", ".bz2": "bzip2"}

    def __init__(self, method="auto", name=None):
        self.method = method
        self.name = name

    def __call__(self, fname, action, pooch):
        """
        Decompress the given file.

        The output file will be either ``{fname}.decomp`` or the given *name*
        class attribute.

        Parameters
        ----------
        fname : str
            Full path of the compressed file in local storage.
        action : str
            Indicates what action was taken by :meth:`pooch.Pooch.fetch` or
            :func:`pooch.retrieve`:

            - ``"download"``: File didn't exist locally and was downloaded
            - ``"update"``: Local file was outdated and was re-download
            - ``"fetch"``: File exists and is updated so it wasn't downloaded

        pooch : :class:`pooch.Pooch`
            The instance of :class:`pooch.Pooch` that is calling this.

        Returns
        -------
        fname : str
            The full path to the decompressed file.
        """
        if self.name is None:
            decompressed = fname + ".decomp"
        else:
            # Custom name is placed next to the compressed file.
            decompressed = os.path.join(os.path.dirname(fname), self.name)
        # Only decompress when the archive changed or the output is missing.
        if action in ("update", "download") or not os.path.exists(decompressed):
            get_logger().info(
                "Decompressing '%s' to '%s' using method '%s'.",
                fname,
                decompressed,
                self.method,
            )
            module = self._compression_module(fname)
            with open(decompressed, "w+b") as output:
                with module.open(fname) as compressed:
                    shutil.copyfileobj(compressed, output)
        return decompressed

    def _compression_module(self, fname):
        """
        Get the Python module compatible with fname and the chosen method.

        If the *method* attribute is "auto", will select a method based on the
        extension. If no recognized extension is in the file name, will raise a
        ValueError.
        """
        error_archives = "To unpack zip/tar archives, use pooch.Unzip/Untar instead."
        if self.method not in self.modules:
            message = (
                f"Invalid compression method '{self.method}'. "
                f"Must be one of '{list(self.modules.keys())}'."
            )
            # Common mistake: point users at the archive processors instead.
            if self.method in {"zip", "tar"}:
                message = " ".join([message, error_archives])
            raise ValueError(message)
        if self.method == "auto":
            ext = os.path.splitext(fname)[-1]
            if ext not in self.extensions:
                message = (
                    f"Unrecognized file extension '{ext}'. "
                    f"Must be one of '{list(self.extensions.keys())}'."
                )
                if ext in {".zip", ".tar"}:
                    message = " ".join([message, error_archives])
                raise ValueError(message)
            return self.modules[self.extensions[ext]]
        return self.modules[self.method]
|
||||
@@ -0,0 +1,6 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,10 @@
|
||||
subdir/tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
|
||||
tiny-data.zip 0d49e94f07bc1866ec57e7fd1b93a351fba36842ec9b13dd50bf94e8dfa35cbb
|
||||
large-data.txt 98de171fb320da82982e6bf0f3994189fff4b42b23328769afce12bdd340444a
|
||||
store.zip 0498d2a001e71051bbd2acd2346f38da7cbd345a633cb7bf0f8a20938714b51a
|
||||
tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d https://some-site/tiny-data.txt
|
||||
tiny-data.tar.gz 41503f083814f43a01a8e9a30c28d7a9fe96839a99727a7fdd0acf7cd5bab63b
|
||||
store.tar.gz 088c7f4e0f1859b1c769bb6065de24376f366374817ede8691a6ac2e49f29511
|
||||
tiny-data.txt.bz2 753663687a4040c90c8578061867d1df623e6aa8011c870a5dbd88ee3c82e306
|
||||
tiny-data.txt.gz 2e2da6161291657617c32192dba95635706af80c6e7335750812907b58fd4b52
|
||||
tiny-data.txt.xz 99dcb5c32a6e916344bacb4badcbc2f2b6ee196977d1d8187610c21e7e607765
|
||||
@@ -0,0 +1,2 @@
|
||||
tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
|
||||
some-file.txt second_element third_element forth_element
|
||||
@@ -0,0 +1,2 @@
|
||||
"file with spaces.txt" baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
|
||||
other\ with\ spaces.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
|
||||
@@ -0,0 +1,12 @@
|
||||
subdir/tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
|
||||
tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
|
||||
large-data.txt 98de171fb320da82982e6bf0f3994189fff4b42b23328769afce12bdd340444a
|
||||
tiny-data.zip 0d49e94f07bc1866ec57e7fd1b93a351fba36842ec9b13dd50bf94e8dfa35cbb
|
||||
|
||||
store.zip 0498D2A001E71051BBD2ACD2346F38DA7CBD345A633CB7BF0F8A20938714B51A
|
||||
tiny-data.tar.gz 41503f083814f43a01a8e9a30c28d7a9fe96839a99727a7fdd0acf7cd5bab63b
|
||||
|
||||
store.tar.gz 088c7f4e0f1859b1c769bb6065de24376f366374817ede8691a6ac2e49f29511
|
||||
tiny-data.txt.bz2 753663687a4040c90c8578061867d1df623e6aa8011c870a5dbd88ee3c82e306
|
||||
tiny-data.txt.gz 2e2da6161291657617c32192dba95635706af80c6e7335750812907b58fd4b52
|
||||
tiny-data.txt.xz 99dcb5c32a6e916344bacb4badcbc2f2b6ee196977d1d8187610c21e7e607765
|
||||
@@ -0,0 +1,14 @@
|
||||
# a comment
|
||||
subdir/tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
|
||||
tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
|
||||
large-data.txt 98de171fb320da82982e6bf0f3994189fff4b42b23328769afce12bdd340444a
|
||||
tiny-data.zip 0d49e94f07bc1866ec57e7fd1b93a351fba36842ec9b13dd50bf94e8dfa35cbb
|
||||
|
||||
# a comment with a starting space
|
||||
store.zip 0498d2a001e71051bbd2acd2346f38da7cbd345a633cb7bf0f8a20938714b51a
|
||||
tiny-data.tar.gz 41503f083814f43a01a8e9a30c28d7a9fe96839a99727a7fdd0acf7cd5bab63b
|
||||
|
||||
store.tar.gz 088c7f4e0f1859b1c769bb6065de24376f366374817ede8691a6ac2e49f29511
|
||||
tiny-data.txt.bz2 753663687a4040c90c8578061867d1df623e6aa8011c870a5dbd88ee3c82e306
|
||||
tiny-data.txt.gz 2e2da6161291657617c32192dba95635706af80c6e7335750812907b58fd4b52
|
||||
tiny-data.txt.xz 99dcb5c32a6e916344bacb4badcbc2f2b6ee196977d1d8187610c21e7e607765
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,2 @@
|
||||
# A tiny data file for test purposes only
|
||||
1 2 3 4 5 6
|
||||
@@ -0,0 +1,2 @@
|
||||
# A tiny data file for test purposes only
|
||||
1 2 3 4 5 6
|
||||
Binary file not shown.
@@ -0,0 +1,2 @@
|
||||
# A tiny data file for test purposes only
|
||||
1 2 3 4 5 6
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,689 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
# pylint: disable=redefined-outer-name
|
||||
"""
|
||||
Test the core class and factory function.
|
||||
"""
|
||||
import hashlib
|
||||
import os
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
import pytest
|
||||
|
||||
from ..core import create, Pooch, retrieve, download_action, stream_download
|
||||
from ..utils import get_logger, temporary_file, os_cache
|
||||
from ..hashes import file_hash, hash_matches
|
||||
|
||||
# Import the core module so that we can monkeypatch some functions
|
||||
from .. import core
|
||||
from ..downloaders import HTTPDownloader, FTPDownloader
|
||||
|
||||
from .utils import (
|
||||
pooch_test_url,
|
||||
data_over_ftp,
|
||||
pooch_test_figshare_url,
|
||||
pooch_test_zenodo_url,
|
||||
pooch_test_zenodo_with_slash_url,
|
||||
pooch_test_dataverse_url,
|
||||
pooch_test_registry,
|
||||
check_tiny_data,
|
||||
check_large_data,
|
||||
capture_log,
|
||||
mirror_directory,
|
||||
)
|
||||
|
||||
DATA_DIR = str(Path(__file__).parent / "data")
|
||||
REGISTRY = pooch_test_registry()
|
||||
BASEURL = pooch_test_url()
|
||||
FIGSHAREURL = pooch_test_figshare_url()
|
||||
ZENODOURL = pooch_test_zenodo_url()
|
||||
ZENODOURL_W_SLASH = pooch_test_zenodo_with_slash_url()
|
||||
DATAVERSEURL = pooch_test_dataverse_url()
|
||||
REGISTRY_CORRUPTED = {
|
||||
# The same data file but I changed the hash manually to a wrong one
|
||||
"tiny-data.txt": "098h0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d"
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture
def data_dir_mirror(tmp_path):
    """
    Provide a writable copy of the packaged test data.

    The install location of pooch may be read-only, so tests that modify
    data files operate on this temporary mirror instead.
    """
    return mirror_directory(DATA_DIR, tmp_path)
|
||||
|
||||
|
||||
@pytest.mark.network
def test_retrieve():
    "Try downloading some data with retrieve"
    with TemporaryDirectory() as storage:
        filename = "tiny-data.txt"
        source_url = BASEURL + filename
        # The first fetch must trigger a download and log it
        with capture_log() as log:
            result = retrieve(source_url, known_hash=None, path=storage)
            messages = log.getvalue()
            assert messages.split()[0] == "Downloading"
            assert "SHA256 hash of downloaded file:" in messages
            assert REGISTRY[filename] in messages
        # The returned path ends in the file name and the content is intact
        assert result.endswith(filename)
        check_tiny_data(result)
        assert file_hash(result) == REGISTRY[filename]
        # Repeated fetches (with or without a known hash) are silent
        with capture_log() as log:
            result = retrieve(source_url, known_hash=None, path=storage)
            assert log.getvalue() == ""
        with capture_log() as log:
            result = retrieve(source_url, known_hash=REGISTRY[filename], path=storage)
            assert log.getvalue() == ""
|
||||
|
||||
|
||||
@pytest.mark.network
def test_retrieve_fname():
    "Try downloading some data with retrieve and setting the file name"
    with TemporaryDirectory() as storage:
        filename = "tiny-data.txt"
        source_url = BASEURL + filename
        # Downloading should be reported in the logs
        with capture_log() as log:
            result = retrieve(source_url, known_hash=None, path=storage, fname=filename)
            messages = log.getvalue()
            assert messages.split()[0] == "Downloading"
            assert "SHA256 hash of downloaded file:" in messages
            assert REGISTRY[filename] in messages
        # The explicit fname must be honored and the content must be intact
        assert os.path.split(result)[1] == filename
        check_tiny_data(result)
        assert file_hash(result) == REGISTRY[filename]
|
||||
|
||||
|
||||
@pytest.mark.network
def test_retrieve_default_path():
    "Try downloading some data with retrieve to the default cache location"
    filename = "tiny-data.txt"
    source_url = BASEURL + filename
    destination = os_cache("pooch") / filename
    try:
        # The download and its destination should show up in the logs
        with capture_log() as log:
            result = retrieve(source_url, known_hash=None, fname=filename)
            messages = log.getvalue()
            assert messages.split()[0] == "Downloading"
            assert str(os_cache("pooch").resolve()) in messages
            assert "SHA256 hash of downloaded file" in messages
            assert REGISTRY[filename] in messages
        # File must land in the default cache with the right content
        assert result == str(destination.resolve())
        check_tiny_data(result)
        assert file_hash(result) == REGISTRY[filename]
    finally:
        # Don't pollute the user's real cache between test runs
        if os.path.exists(str(destination)):
            os.remove(str(destination))
|
||||
|
||||
|
||||
def test_pooch_local(data_dir_mirror):
    "Setup a pooch that already has the local data and test the fetch."
    # The bogus base_url proves no network access happens for cached files
    pup = Pooch(path=data_dir_mirror, base_url="some bogus URL", registry=REGISTRY)
    fetched = pup.fetch("tiny-data.txt")
    expected = str(data_dir_mirror / "tiny-data.txt")
    assert fetched == expected
    check_tiny_data(fetched)
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.parametrize(
    "url",
    [
        BASEURL,
        pytest.param(FIGSHAREURL, marks=pytest.mark.figshare),
        ZENODOURL,
        DATAVERSEURL,
    ],
    ids=["https", "figshare", "zenodo", "dataverse"],
)
def test_pooch_custom_url(url):
    "Have pooch download the file from URL that is not base_url"
    with TemporaryDirectory() as storage:
        cache = Path(storage)
        # Per-file URL overrides take precedence over base_url
        overrides = {"tiny-data.txt": url + "tiny-data.txt"}
        pup = Pooch(path=cache, base_url="", registry=REGISTRY, urls=overrides)
        # First fetch downloads and logs the action and destination
        with capture_log() as log:
            fetched = pup.fetch("tiny-data.txt")
            messages = log.getvalue()
            assert messages.split()[0] == "Downloading"
            assert messages.split()[-1] == f"'{cache}'."
        check_tiny_data(fetched)
        # Second fetch hits the cache and stays silent
        with capture_log() as log:
            fetched = pup.fetch("tiny-data.txt")
            assert log.getvalue() == ""
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.parametrize(
    "url",
    [
        BASEURL,
        pytest.param(FIGSHAREURL, marks=pytest.mark.figshare),
        ZENODOURL,
        DATAVERSEURL,
    ],
    ids=["https", "figshare", "zenodo", "dataverse"],
)
def test_pooch_download(url):
    "Setup a pooch that has no local data and needs to download"
    with TemporaryDirectory() as storage:
        cache = Path(storage)
        expected = str(cache / "tiny-data.txt")
        pup = Pooch(path=cache, base_url=url, registry=REGISTRY)
        # The initial fetch downloads and logs the destination directory
        with capture_log() as log:
            fetched = pup.fetch("tiny-data.txt")
            messages = log.getvalue()
            assert messages.split()[0] == "Downloading"
            assert messages.split()[-1] == f"'{cache}'."
        # The file must be where we expect, intact, and hash-verified
        assert fetched == expected
        check_tiny_data(fetched)
        assert file_hash(fetched) == REGISTRY["tiny-data.txt"]
        # Cached fetches produce no log output
        with capture_log() as log:
            fetched = pup.fetch("tiny-data.txt")
            assert log.getvalue() == ""
|
||||
|
||||
|
||||
class FakeHashMatches:  # pylint: disable=too-few-public-methods
    "Create a fake version of hash_matches that fails n times"

    def __init__(self, nfailures):
        # Total number of times hash_matches should report a mismatch
        self.nfailures = nfailures
        # How many failures have been produced so far
        self.failed = 0

    def hash_matches(self, *args, **kwargs):
        "Fail n times before finally passing"
        if self.failed >= self.nfailures:
            # Budget exhausted: delegate to the real check unmodified
            return hash_matches(*args, **kwargs)
        self.failed += 1
        # Substitute a bogus known hash so the real check must fail
        return hash_matches(args[0], "bla", **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.network
def test_pooch_download_retry_off_by_default(monkeypatch):
    "Check that retrying the download is off by default"
    with TemporaryDirectory() as storage:
        # Force 3 hash mismatches; with no retries, the first one is fatal
        monkeypatch.setattr(core, "hash_matches", FakeHashMatches(3).hash_matches)
        cache = Path(storage)
        pup = Pooch(path=cache, base_url=BASEURL, registry=REGISTRY)
        with pytest.raises(ValueError) as error:
            with capture_log() as log:
                pup.fetch("tiny-data.txt")
        assert "does not match the known hash" in str(error)
        # Only the download message should be logged; no retry notices
        messages = log.getvalue().strip().split("\n")
        assert len(messages) == 1
        assert messages[0].startswith("Downloading")
        assert messages[0].endswith(f"'{cache}'.")
|
||||
|
||||
|
||||
class FakeSleep:  # pylint: disable=too-few-public-methods
    "Create a fake version of sleep that logs the specified times"

    def __init__(self):
        # Sequence of sleep durations requested, in call order
        self.times = []

    def sleep(self, secs):
        "Store the time and doesn't sleep"
        self.times.append(secs)
|
||||
|
||||
|
||||
@pytest.mark.network
def test_pooch_download_retry(monkeypatch):
    "Check that retrying the download works if the hash is different"
    with TemporaryDirectory() as storage:
        retries = 11
        # Fail the hash check exactly `retries` times, then succeed
        monkeypatch.setattr(core, "hash_matches", FakeHashMatches(retries).hash_matches)
        # Replace time.sleep so the test records delays instead of waiting
        fakesleep = FakeSleep()
        monkeypatch.setattr(core.time, "sleep", fakesleep.sleep)
        cache = Path(storage)
        expected = str(cache / "tiny-data.txt")
        pup = Pooch(
            path=cache, base_url=BASEURL, registry=REGISTRY, retry_if_failed=retries
        )
        # Expect one "Downloading" line plus one warning per failed attempt
        with capture_log() as log:
            fetched = pup.fetch("tiny-data.txt")
            messages = log.getvalue().strip().split("\n")
            assert len(messages) == 1 + retries
            assert messages[0].startswith("Downloading")
            assert messages[0].endswith(f"'{cache}'.")
            for remaining, line in zip(range(retries, 0, -1), messages[1:]):
                assert "Failed to download" in line
                plural = "s" if remaining > 1 else ""
                assert f"download again {remaining} more time{plural}." in line
        # Backoff grows by 1s per attempt and is capped at 10s
        assert fakesleep.times == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10]
        # Final attempt succeeded and produced a valid file
        assert fetched == expected
        check_tiny_data(fetched)
        assert file_hash(fetched) == REGISTRY["tiny-data.txt"]
|
||||
|
||||
|
||||
@pytest.mark.network
def test_pooch_download_retry_fails_eventually(monkeypatch):
    "Check that retrying the download fails after the set amount of retries"
    with TemporaryDirectory() as storage:
        # 3 forced failures but only 1 allowed retry: must end in an error
        monkeypatch.setattr(core, "hash_matches", FakeHashMatches(3).hash_matches)
        cache = Path(storage)
        pup = Pooch(path=cache, base_url=BASEURL, registry=REGISTRY, retry_if_failed=1)
        with pytest.raises(ValueError) as error:
            with capture_log() as log:
                pup.fetch("tiny-data.txt")
        # One download line plus exactly one retry warning
        messages = log.getvalue().strip().split("\n")
        assert len(messages) == 2
        assert messages[0].startswith("Downloading")
        assert messages[0].endswith(f"'{cache}'.")
        assert "Failed to download" in messages[1]
        assert "download again 1 more time." in messages[1]
        assert "does not match the known hash" in str(error)
|
||||
|
||||
|
||||
@pytest.mark.network
def test_pooch_logging_level():
    "Setup a pooch and check that no logging happens when the level is raised"
    with TemporaryDirectory() as storage:
        cache = Path(storage)
        overrides = {"tiny-data.txt": BASEURL + "tiny-data.txt"}
        pup = Pooch(path=cache, base_url="", registry=REGISTRY, urls=overrides)
        # Download messages are INFO level, so capturing only CRITICAL
        # should leave the log empty even though a download happens
        with capture_log("CRITICAL") as log:
            fetched = pup.fetch("tiny-data.txt")
            assert log.getvalue() == ""
        check_tiny_data(fetched)
|
||||
|
||||
|
||||
@pytest.mark.network
def test_pooch_update():
    "Setup a pooch that already has the local data but the file is outdated"
    with TemporaryDirectory() as storage:
        cache = Path(storage)
        expected = str(cache / "tiny-data.txt")
        # Plant a stale copy whose content (and hash) differs from the
        # remote file so that fetch must re-download it
        with open(expected, "w", encoding="utf-8") as fout:
            fout.write("different data")
        pup = Pooch(path=cache, base_url=BASEURL, registry=REGISTRY)
        # The stale file triggers an "Updating" action in the logs
        with capture_log() as log:
            fetched = pup.fetch("tiny-data.txt")
            messages = log.getvalue()
            assert messages.split()[0] == "Updating"
            assert messages.split()[-1] == f"'{cache}'."
        # After the update, the file matches the registry again
        assert fetched == expected
        check_tiny_data(fetched)
        assert file_hash(fetched) == REGISTRY["tiny-data.txt"]
        # A follow-up fetch finds a valid file and stays silent
        with capture_log() as log:
            fetched = pup.fetch("tiny-data.txt")
            assert log.getvalue() == ""
|
||||
|
||||
|
||||
def test_pooch_update_disallowed():
    "Test that disallowing updates works."
    with TemporaryDirectory() as storage:
        cache = Path(storage)
        # Plant a stale copy so that fetch would normally update it
        stale = str(cache / "tiny-data.txt")
        with open(stale, "w", encoding="utf-8") as fout:
            fout.write("different data")
        pup = Pooch(
            path=cache,
            base_url=BASEURL,
            registry=REGISTRY,
            allow_updates=False,
        )
        # With updates disabled, the hash mismatch must raise instead
        with pytest.raises(ValueError):
            pup.fetch("tiny-data.txt")
|
||||
|
||||
|
||||
def test_pooch_update_disallowed_environment():
    "Test that disallowing updates works through an environment variable."
    variable_name = "MYPROJECT_DISALLOW_UPDATES"
    try:
        os.environ[variable_name] = "False"
        with TemporaryDirectory() as storage:
            cache = Path(storage)
            # Plant a stale copy so that fetch would normally update it
            stale = str(cache / "tiny-data.txt")
            with open(stale, "w", encoding="utf-8") as fout:
                fout.write("different data")
            # Passing the variable *name* makes create() read the policy
            # from the environment
            pup = create(
                path=cache,
                base_url=BASEURL,
                registry=REGISTRY,
                allow_updates=variable_name,
            )
            with pytest.raises(ValueError):
                pup.fetch("tiny-data.txt")
    finally:
        # Clean up so other tests don't inherit the variable
        os.environ.pop(variable_name)
|
||||
|
||||
|
||||
def test_pooch_create_base_url_no_trailing_slash():
    """
    Test if pooch.create appends a trailing slash to the base url if missing
    """
    bare_url = "https://mybase.url"
    pup = create(base_url=bare_url, registry=None, path=DATA_DIR)
    # create() must normalize the URL so later joins are well-formed
    assert pup.base_url == bare_url + "/"
|
||||
|
||||
|
||||
@pytest.mark.network
def test_pooch_corrupted(data_dir_mirror):
    "Raise an exception if the file hash doesn't match the registry"
    # Case 1: file absent locally, so a download happens first and the
    # hash check on the fresh download fails
    with TemporaryDirectory() as storage:
        cache = os.path.abspath(storage)
        pup = Pooch(path=cache, base_url=BASEURL, registry=REGISTRY_CORRUPTED)
        with capture_log() as log:
            with pytest.raises(ValueError) as error:
                pup.fetch("tiny-data.txt")
            assert "(tiny-data.txt)" in str(error.value)
            messages = log.getvalue()
            assert messages.split()[0] == "Downloading"
            assert messages.split()[-1] == f"'{cache}'."
    # Case 2: file present locally but its hash disagrees, so pooch
    # attempts an update that also fails the check
    pup = Pooch(path=data_dir_mirror, base_url=BASEURL, registry=REGISTRY_CORRUPTED)
    with capture_log() as log:
        with pytest.raises(ValueError) as error:
            pup.fetch("tiny-data.txt")
        assert "(tiny-data.txt)" in str(error.value)
        messages = log.getvalue()
        assert messages.split()[0] == "Updating"
        assert messages.split()[-1] == f"'{data_dir_mirror}'."
|
||||
|
||||
|
||||
def test_pooch_file_not_in_registry():
    "Should raise an exception if the file is not in the registry."
    # Path and URL are irrelevant: the registry lookup fails first
    pup = Pooch(
        path="it shouldn't matter", base_url="this shouldn't either", registry=REGISTRY
    )
    with pytest.raises(ValueError):
        pup.fetch("this-file-does-not-exit.csv")
|
||||
|
||||
|
||||
def test_pooch_load_registry():
    "Loading the registry from a file should work"
    pup = Pooch(path="", base_url="")
    pup.load_registry(os.path.join(DATA_DIR, "registry.txt"))
    assert pup.registry == REGISTRY
    # BUG FIX: the old assertion compared the return values of list.sort(),
    # which are always None, so it could never fail. Compare sorted copies.
    assert sorted(pup.registry_files) == sorted(REGISTRY)
|
||||
|
||||
|
||||
def test_pooch_load_registry_comments():
    "Loading the registry from a file and strip line comments"
    pup = Pooch(path="", base_url="")
    pup.load_registry(os.path.join(DATA_DIR, "registry_comments.txt"))
    assert pup.registry == REGISTRY
    # BUG FIX: the old assertion compared the return values of list.sort(),
    # which are always None, so it could never fail. Compare sorted copies.
    assert sorted(pup.registry_files) == sorted(REGISTRY)
|
||||
|
||||
|
||||
def test_pooch_load_registry_fileobj():
    "Loading the registry from a file object"
    path = os.path.join(DATA_DIR, "registry.txt")

    # Binary mode
    pup = Pooch(path="", base_url="")
    with open(path, "rb") as fin:
        pup.load_registry(fin)
    assert pup.registry == REGISTRY
    # BUG FIX: the old assertion compared the return values of list.sort(),
    # which are always None, so it could never fail. Compare sorted copies.
    assert sorted(pup.registry_files) == sorted(REGISTRY)

    # Text mode
    pup = Pooch(path="", base_url="")
    with open(path, "r", encoding="utf-8") as fin:
        pup.load_registry(fin)
    assert pup.registry == REGISTRY
    # Same fix as above: sorted() returns the lists to compare.
    assert sorted(pup.registry_files) == sorted(REGISTRY)
|
||||
|
||||
|
||||
def test_pooch_load_registry_custom_url():
    "Load the registry from a file with a custom URL inserted"
    pup = Pooch(path="", base_url="")
    pup.load_registry(os.path.join(DATA_DIR, "registry-custom-url.txt"))
    # A registry line may carry a third column with a dedicated URL;
    # it should populate pup.urls alongside the hashes
    assert pup.registry == REGISTRY
    assert pup.urls == {"tiny-data.txt": "https://some-site/tiny-data.txt"}
|
||||
|
||||
|
||||
def test_pooch_load_registry_invalid_line():
    "Should raise an exception when a line doesn't have two elements"
    pup = Pooch(path="", base_url="", registry={})
    # registry-invalid.txt contains a line with too many fields
    with pytest.raises(IOError):
        pup.load_registry(os.path.join(DATA_DIR, "registry-invalid.txt"))
|
||||
|
||||
|
||||
def test_pooch_load_registry_with_spaces():
    "Should check that spaces in filenames are allowed in registry files"
    pup = Pooch(path="", base_url="")
    pup.load_registry(os.path.join(DATA_DIR, "registry-spaces.txt"))
    # Both quoted and backslash-escaped names should parse to plain names
    assert "file with spaces.txt" in pup.registry
    assert "other with spaces.txt" in pup.registry
|
||||
|
||||
|
||||
@pytest.mark.network
def test_check_availability():
    "Should correctly check availability of existing and non existing files"
    # A file that exists on the remote server
    pup = Pooch(path=DATA_DIR, base_url=BASEURL, registry=REGISTRY)
    assert pup.is_available("tiny-data.txt")
    # A valid file name under a broken base URL
    pup = Pooch(path=DATA_DIR, base_url=BASEURL + "wrong-url/", registry=REGISTRY)
    assert not pup.is_available("tiny-data.txt")
    # A registry entry that doesn't exist on the server at all
    bogus_registry = {"not-a-real-data-file.txt": "notarealhash"}
    bogus_registry.update(REGISTRY)
    pup = Pooch(path=DATA_DIR, base_url=BASEURL, registry=bogus_registry)
    assert not pup.is_available("not-a-real-data-file.txt")
|
||||
|
||||
|
||||
def test_check_availability_on_ftp(ftpserver):
    "Should correctly check availability of existing and non existing files"
    with data_over_ftp(ftpserver, "tiny-data.txt") as url:
        # Registry has one real file and one that isn't on the server
        pup = Pooch(
            path=DATA_DIR,
            base_url=url.replace("tiny-data.txt", ""),
            registry={
                "tiny-data.txt": "baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d",
                "doesnot_exist.zip": "jdjdjdjdflld",
            },
        )
        # Must use a downloader pointed at the test server's port
        ftp = FTPDownloader(port=ftpserver.server_port)
        assert pup.is_available("tiny-data.txt", downloader=ftp)
        assert not pup.is_available("doesnot_exist.zip", downloader=ftp)
|
||||
|
||||
|
||||
def test_check_availability_invalid_downloader():
    "Should raise an exception if the downloader doesn't support this"

    def no_check_downloader(url, output, pooch):  # pylint: disable=unused-argument
        "A downloader that doesn't support check_only"
        return None

    pup = Pooch(path=DATA_DIR, base_url=BASEURL, registry=REGISTRY)
    # is_available requires the downloader to accept check_only; this
    # one doesn't, so a NotImplementedError with this message is expected
    expected_message = "does not support availability checks."
    with pytest.raises(NotImplementedError, match=expected_message):
        pup.is_available("tiny-data.txt", downloader=no_check_downloader)
|
||||
|
||||
|
||||
@pytest.mark.network
def test_fetch_with_downloader(capsys):
    "Setup a downloader function for fetch"

    def download(url, output_file, pup):  # pylint: disable=unused-argument
        "Download through HTTP and warn that we're doing it"
        # Leave a marker in the log so we can verify this ran
        get_logger().info("downloader executed")
        HTTPDownloader()(url, output_file, pup)

    with TemporaryDirectory() as storage:
        cache = Path(storage)
        pup = Pooch(path=cache, base_url=BASEURL, registry=REGISTRY)
        # Fetch with the custom downloader: expect the normal download
        # message followed by our marker
        with capture_log() as log:
            fetched = pup.fetch("large-data.txt", downloader=download)
            messages = log.getvalue().splitlines()
            assert len(messages) == 2
            assert messages[0].split()[0] == "Downloading"
            assert messages[1] == "downloader executed"
        # No progress bar should have been written to stderr by default
        assert not capsys.readouterr().err
        check_large_data(fetched)
        # Cached fetch is silent
        with capture_log() as log:
            fetched = pup.fetch("large-data.txt")
            assert log.getvalue() == ""
|
||||
|
||||
|
||||
def test_invalid_hash_alg(data_dir_mirror):
    "Test an invalid hashing algorithm"
    # "blah" is not a recognized algorithm prefix in the registry entry
    pup = Pooch(
        path=data_dir_mirror, base_url=BASEURL, registry={"tiny-data.txt": "blah:1234"}
    )
    with pytest.raises(ValueError) as exc:
        pup.fetch("tiny-data.txt")
    # The error message should name the offending algorithm
    assert "'blah'" in str(exc.value)
|
||||
|
||||
|
||||
def test_alternative_hashing_algorithms(data_dir_mirror):
    "Test different hashing algorithms using local data"
    fname = str(data_dir_mirror / "tiny-data.txt")
    check_tiny_data(fname)
    with open(fname, "rb") as fin:
        contents = fin.read()
    for algorithm in ("sha512", "md5"):
        # Build a registry entry in "algorithm:digest" form and verify
        # that fetch accepts it for the already-cached file
        digest = hashlib.new(algorithm)
        digest.update(contents)
        entries = {"tiny-data.txt": f"{algorithm}:{digest.hexdigest()}"}
        pup = Pooch(path=data_dir_mirror, base_url="some bogus URL", registry=entries)
        assert pup.fetch("tiny-data.txt") == fname
        check_tiny_data(fname)
|
||||
|
||||
|
||||
def test_download_action():
    "Test that the right action is performed based on file existing"
    # Missing file: must be downloaded
    missing = Path("this_file_does_not_exist.txt")
    assert download_action(missing, known_hash=None) == ("download", "Downloading")

    # Existing file with a mismatched hash: must be updated
    with temporary_file() as tmp:
        result = download_action(Path(tmp), known_hash="not the correct hash")
        assert result == ("update", "Updating")

    # Existing file whose hash matches: just fetched from cache
    with temporary_file() as tmp:
        with open(tmp, "w", encoding="utf-8") as output:
            output.write("some data")
        result = download_action(Path(tmp), known_hash=file_hash(tmp))
        assert result == ("fetch", "Fetching")
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.parametrize("fname", ["tiny-data.txt", "subdir/tiny-data.txt"])
def test_stream_download(fname):
    "Check that downloading a file over HTTP works as expected"
    # Use the data in store/ because the subdir is in there for some reason
    source_url = BASEURL + "store/" + fname
    expected_hash = REGISTRY[fname]
    http = HTTPDownloader()
    with TemporaryDirectory() as storage:
        target = Path(storage) / fname
        assert not target.exists()
        # stream_download must create intermediate dirs and verify the hash
        stream_download(source_url, target, expected_hash, http, pooch=None)
        assert target.exists()
        check_tiny_data(str(target))
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.parametrize(
    "url",
    [pytest.param(FIGSHAREURL, marks=pytest.mark.figshare), ZENODOURL, DATAVERSEURL],
    ids=["figshare", "zenodo", "dataverse"],
)
def test_load_registry_from_doi(url):
    """Check that the registry is correctly populated from the API"""
    with TemporaryDirectory() as storage:
        cache = os.path.abspath(storage)
        pup = Pooch(path=cache, base_url=url)
        pup.load_registry_from_doi()

        # The archive should list exactly these two files
        assert len(pup.registry) == 2
        assert "tiny-data.txt" in pup.registry
        assert "store.zip" in pup.registry

        # Fetching each file also verifies the checksums from the API
        for name in pup.registry:
            pup.fetch(name)
|
||||
|
||||
|
||||
@pytest.mark.network
def test_load_registry_from_doi_zenodo_with_slash():
    """
    Check that the registry is correctly populated from the Zenodo API when
    the filename contains a slash
    """
    with TemporaryDirectory() as storage:
        cache = os.path.abspath(storage)
        pup = Pooch(path=cache, base_url=ZENODOURL_W_SLASH)
        pup.load_registry_from_doi()

        # The archive holds one file whose name contains a slash
        assert len(pup.registry) == 1
        assert "santisoler/pooch-test-data-v1.zip" in pup.registry

        # Fetching each file also verifies the checksums from the API
        for name in pup.registry:
            pup.fetch(name)
|
||||
|
||||
|
||||
def test_wrong_load_registry_from_doi():
    """Check that non-DOI URLs produce an error"""
    # BASEURL is a plain HTTPS URL, not a doi: address
    pup = Pooch(path="", base_url=BASEURL)
    with pytest.raises(ValueError) as exc:
        pup.load_registry_from_doi()
    assert "only implemented for DOIs" in str(exc.value)
|
||||
@@ -0,0 +1,582 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
"""
|
||||
Test the downloader classes and functions separately from the Pooch core.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
import pytest
|
||||
from requests import HTTPError
|
||||
|
||||
# Mypy doesn't like assigning None like this.
|
||||
# Can just use a guard variable
|
||||
|
||||
try:
|
||||
import tqdm
|
||||
except ImportError:
|
||||
tqdm = None # type: ignore
|
||||
|
||||
try:
|
||||
import paramiko
|
||||
except ImportError:
|
||||
paramiko = None # type: ignore
|
||||
|
||||
from .. import Pooch
|
||||
from ..downloaders import (
|
||||
HTTPDownloader,
|
||||
FTPDownloader,
|
||||
SFTPDownloader,
|
||||
DOIDownloader,
|
||||
choose_downloader,
|
||||
FigshareRepository,
|
||||
ZenodoRepository,
|
||||
DataverseRepository,
|
||||
doi_to_url,
|
||||
REQUESTS_HEADERS,
|
||||
)
|
||||
from ..processors import Unzip
|
||||
from .utils import (
|
||||
pooch_test_url,
|
||||
check_large_data,
|
||||
check_tiny_data,
|
||||
data_over_ftp,
|
||||
pooch_test_figshare_url,
|
||||
pooch_test_zenodo_url,
|
||||
pooch_test_zenodo_with_slash_url,
|
||||
pooch_test_dataverse_url,
|
||||
)
|
||||
|
||||
|
||||
BASEURL = pooch_test_url()
|
||||
FIGSHAREURL = pooch_test_figshare_url()
|
||||
ZENODOURL = pooch_test_zenodo_url()
|
||||
ZENODOURL_W_SLASH = pooch_test_zenodo_with_slash_url()
|
||||
DATAVERSEURL = pooch_test_dataverse_url()
|
||||
|
||||
|
||||
@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
@pytest.mark.parametrize(
    "url",
    [
        BASEURL + "tiny-data.txt",  # HTTPDownloader
        ZENODOURL,  # DOIDownloader
    ],
)
def test_progressbar_kwarg_passed(url):
    """The progressbar keyword argument must pass through choose_downloader"""
    chosen = choose_downloader(url, progressbar=True)
    assert chosen.progressbar is True
|
||||
|
||||
|
||||
@pytest.mark.skipif(paramiko is None, reason="requires paramiko")
def test_progressbar_kwarg_passed_sftp():
    """The progressbar keyword argument must pass through choose_downloader"""
    sftp_url = "sftp://test.rebex.net/pub/example/pocketftp.png"
    chosen = choose_downloader(sftp_url, progressbar=True)
    assert chosen.progressbar is True
|
||||
|
||||
|
||||
def test_unsupported_protocol():
    "Should raise ValueError when protocol is not supported"
    # A misspelled scheme is not recognized
    with pytest.raises(ValueError):
        choose_downloader("httpup://some-invalid-url.com")
    # Simulate the DOI format
    with pytest.raises(ValueError):
        choose_downloader("doii:XXX/XXX/file")
|
||||
|
||||
|
||||
@pytest.mark.network
def test_invalid_doi_repository():
    "Should fail if data repository is not supported"
    with pytest.raises(ValueError) as exc:
        # Use the DOI of the Pooch paper in JOSS (not a data repository)
        DOIDownloader()(
            url="doi:10.21105/joss.01943/file_name.txt", output_file=None, pooch=None
        )
    # The resolved host should be named in the error
    assert "Invalid data repository 'joss.theoj.org'" in str(exc.value)
|
||||
|
||||
|
||||
@pytest.mark.network
def test_doi_url_not_found():
    "Should fail if the DOI is not found"
    # doi.org resolution of a bogus DOI should surface as an HTTPError
    with pytest.raises(HTTPError):
        doi_to_url(doi="NOTAREALDOI")
|
||||
|
||||
|
||||
# NOTE(review): despite the "figshare" in its name, this test covers all
# three supported repositories — consider renaming in a future cleanup.
@pytest.mark.network
@pytest.mark.parametrize(
    "repository,doi",
    [
        pytest.param(
            FigshareRepository,
            "10.6084/m9.figshare.14763051.v1",
            marks=pytest.mark.figshare,
        ),
        (ZenodoRepository, "10.5281/zenodo.4924875"),
        (DataverseRepository, "10.11588/data/TKCFEF"),
    ],
    ids=["figshare", "zenodo", "dataverse"],
)
def test_figshare_url_file_not_found(repository, doi):
    "Should fail if the file is not found in the archive"
    with pytest.raises(ValueError) as exc:
        # Resolve the DOI, build the repository wrapper, then ask for a
        # file that is not part of the archive.
        url = doi_to_url(doi)
        repo = repository.initialize(doi, url)
        repo.download_url(file_name="bla.txt")
    assert "File 'bla.txt' not found" in str(exc.value)
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.parametrize(
    "url",
    [pytest.param(FIGSHAREURL, marks=pytest.mark.figshare), ZENODOURL, DATAVERSEURL],
    ids=["figshare", "zenodo", "dataverse"],
)
def test_doi_downloader(url):
    "Test the DOI downloader"
    # Use the test data we have on the repository
    with TemporaryDirectory() as local_store:
        downloader = DOIDownloader()
        outfile = os.path.join(local_store, "tiny-data.txt")
        # Downloader protocol: (url, output_file, pooch_instance)
        downloader(url + "tiny-data.txt", outfile, None)
        # Verify content, not just existence, of the downloaded file
        check_tiny_data(outfile)
|
||||
|
||||
|
||||
@pytest.mark.network
def test_zenodo_downloader_with_slash_in_fname():
    """
    Test the Zenodo downloader when the path contains a forward slash

    Related to issue #336
    """
    # Use the test data we have on the repository
    with TemporaryDirectory() as local_store:
        # The archived file name itself contains a "/", which used to break
        # URL construction (issue #336).
        base_url = ZENODOURL_W_SLASH + "santisoler/pooch-test-data-v1.zip"
        downloader = DOIDownloader()
        outfile = os.path.join(local_store, "test-data.zip")
        downloader(base_url, outfile, None)
        # unpack the downloaded zip file so we can check the integrity of
        # tiny-data.txt
        fnames = Unzip()(outfile, action="download", pooch=None)
        # Tuple-unpack to also assert exactly one match was extracted
        (fname,) = [f for f in fnames if "tiny-data.txt" in f]
        check_tiny_data(fname)
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.figshare
def test_figshare_unspecified_version():
    """
    Test if passing a Figshare url without a version warns about it, but still
    downloads it.
    """
    url = FIGSHAREURL
    # Remove the trailing ".vN" suffix from the DOI, where the version is
    # specified, so the URL no longer pins a version.
    url = url[: url.rindex(".")] + "/"
    # Build the expected warning message from the bare DOI
    # (assumes the URL is "doi:<DOI>/" — strip the "doi:" prefix and
    # trailing slash; TODO confirm against pooch_test_figshare_url)
    doi = url[4:-1]
    warning_msg = f"The Figshare DOI '{doi}' doesn't specify which version of "
    with TemporaryDirectory() as local_store:
        downloader = DOIDownloader()
        outfile = os.path.join(local_store, "tiny-data.txt")
        # The download should still succeed, but must emit the warning
        with pytest.warns(UserWarning, match=warning_msg):
            downloader(url + "tiny-data.txt", outfile, None)
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.figshare
@pytest.mark.parametrize(
    # Each archive version contains one file and not the other; swapping
    # the version must swap which file is listed.
    "version, missing, present",
    [
        (
            1,
            "LC08_L2SP_218074_20190114_20200829_02_T1-cropped.tar.gz",
            "cropped-before.tar.gz",
        ),
        (
            2,
            "cropped-before.tar.gz",
            "LC08_L2SP_218074_20190114_20200829_02_T1-cropped.tar.gz",
        ),
    ],
)
def test_figshare_data_repository_versions(version, missing, present):
    """
    Test if setting the version in Figshare DOI works as expected
    """
    # Use a Figshare repo as example (we won't download files from it since
    # they are too big)
    doi = f"10.6084/m9.figshare.21665630.v{version}"
    url = f"https://doi.org/{doi}/"
    figshare = FigshareRepository(doi, url)
    # Only inspect the file listing from the API, never download
    filenames = [item["name"] for item in figshare.api_response]
    assert present in filenames
    assert missing not in filenames
|
||||
|
||||
|
||||
@pytest.mark.network
def test_ftp_downloader(ftpserver):
    "Test ftp downloader"
    # data_over_ftp serves the file through the local pytest FTP server
    with data_over_ftp(ftpserver, "tiny-data.txt") as url:
        with TemporaryDirectory() as local_store:
            # Must point the downloader at the ephemeral server port
            downloader = FTPDownloader(port=ftpserver.server_port)
            outfile = os.path.join(local_store, "tiny-data.txt")
            downloader(url, outfile, None)
            check_tiny_data(outfile)
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.skipif(paramiko is None, reason="requires paramiko to run SFTP")
def test_sftp_downloader():
    "Test sftp downloader"
    with TemporaryDirectory() as local_store:
        # Public rebex.net demo credentials (read-only test server)
        downloader = SFTPDownloader(username="demo", password="password")
        url = "sftp://test.rebex.net/pub/example/pocketftp.png"
        outfile = os.path.join(local_store, "pocketftp.png")
        downloader(url, outfile, None)
        # Binary file: only check that it arrived, not its content
        assert os.path.exists(outfile)
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.skipif(paramiko is None, reason="requires paramiko to run SFTP")
def test_sftp_downloader_fail_if_file_object():
    "Downloader should fail when a file object rather than string is passed"
    with TemporaryDirectory() as local_store:
        downloader = SFTPDownloader(username="demo", password="password")
        url = "sftp://test.rebex.net/pub/example/pocketftp.png"
        outfile = os.path.join(local_store, "pocketftp.png")
        # The SFTP downloader only accepts a path string for the output;
        # passing an open file handle must raise TypeError.
        with open(outfile, "wb") as outfile_obj:
            with pytest.raises(TypeError):
                downloader(url, outfile_obj, None)
|
||||
|
||||
|
||||
@pytest.mark.skipif(paramiko is not None, reason="paramiko must be missing")
def test_sftp_downloader_fail_if_paramiko_missing():
    """Instantiating SFTPDownloader without paramiko must raise ValueError."""
    # The error message must point the user at the missing dependency.
    with pytest.raises(ValueError, match="'paramiko'"):
        SFTPDownloader()
|
||||
|
||||
|
||||
@pytest.mark.skipif(tqdm is not None, reason="tqdm must be missing")
@pytest.mark.parametrize("downloader", [HTTPDownloader, FTPDownloader, SFTPDownloader])
def test_downloader_progressbar_fails(downloader):
    """Requesting a progress bar without tqdm installed must raise ValueError."""
    # The error message must name the missing dependency.
    with pytest.raises(ValueError, match="'tqdm'"):
        downloader(progressbar=True)
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
@pytest.mark.parametrize(
    "url,downloader",
    [
        (BASEURL, HTTPDownloader),
        pytest.param(FIGSHAREURL, DOIDownloader, marks=pytest.mark.figshare),
    ],
    ids=["http", "figshare"],
)
def test_downloader_progressbar(url, downloader, capsys):
    "Setup a downloader function that prints a progress bar for fetch"
    download = downloader(progressbar=True)
    with TemporaryDirectory() as local_store:
        fname = "tiny-data.txt"
        url = url + fname
        outfile = os.path.join(local_store, fname)
        download(url, outfile, None)
        # Read stderr and make sure the progress bar is printed only when told
        captured = capsys.readouterr()
        # tqdm redraws with carriage returns; the last segment is the
        # final state of the bar.
        printed = captured.err.split("\r")[-1].strip()
        # assumes tqdm renders a fixed 79-column bar here — TODO confirm
        assert len(printed) == 79
        if sys.platform == "win32":
            # tqdm falls back to ASCII fill characters on Windows consoles
            progress = "100%|####################"
        else:
            progress = "100%|████████████████████"
        # Bar size is not always the same so can't reliably test the whole bar.
        assert printed[:25] == progress
        # Check that the downloaded file has the right content
        check_tiny_data(outfile)
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
def test_downloader_progressbar_ftp(capsys, ftpserver):
    "Setup an FTP downloader function that prints a progress bar for fetch"
    with data_over_ftp(ftpserver, "tiny-data.txt") as url:
        download = FTPDownloader(progressbar=True, port=ftpserver.server_port)
        with TemporaryDirectory() as local_store:
            outfile = os.path.join(local_store, "tiny-data.txt")
            download(url, outfile, None)
            # Read stderr and make sure the progress bar is printed only when
            # told
            captured = capsys.readouterr()
            # Last carriage-return segment is the bar's final state
            printed = captured.err.split("\r")[-1].strip()
            # assumes tqdm renders a fixed 79-column bar here — TODO confirm
            assert len(printed) == 79
            if sys.platform == "win32":
                # tqdm uses ASCII fill characters on Windows consoles
                progress = "100%|####################"
            else:
                progress = "100%|████████████████████"
            # Bar size is not always the same so can't reliably test the whole
            # bar.
            assert printed[:25] == progress
            # Check that the file was actually downloaded
            check_tiny_data(outfile)
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
@pytest.mark.skipif(paramiko is None, reason="requires paramiko")
def test_downloader_progressbar_sftp(capsys):
    "Setup an SFTP downloader function that prints a progress bar for fetch"
    # Public rebex.net demo credentials (read-only test server)
    downloader = SFTPDownloader(progressbar=True, username="demo", password="password")
    with TemporaryDirectory() as local_store:
        url = "sftp://test.rebex.net/pub/example/pocketftp.png"
        outfile = os.path.join(local_store, "pocketftp.png")
        downloader(url, outfile, None)
        # Read stderr and make sure the progress bar is printed only when told
        captured = capsys.readouterr()
        # Last carriage-return segment is the bar's final state
        printed = captured.err.split("\r")[-1].strip()
        # assumes tqdm renders a fixed 79-column bar here — TODO confirm
        assert len(printed) == 79
        if sys.platform == "win32":
            # tqdm uses ASCII fill characters on Windows consoles
            progress = "100%|####################"
        else:
            progress = "100%|████████████████████"
        # Bar size is not always the same so can't reliably test the whole bar.
        assert printed[:25] == progress
        # Check that the file was actually downloaded
        assert os.path.exists(outfile)
|
||||
|
||||
|
||||
@pytest.mark.network
def test_downloader_arbitrary_progressbar(capsys):
    "Setup a downloader function with an arbitrary progress bar class."

    class MinimalProgressDisplay:
        """A minimalist replacement for tqdm.tqdm"""

        # Implements the subset of the tqdm interface that pooch's
        # downloaders call: total, update(), reset(), close().

        def __init__(self, total):
            self.count = 0
            self.total = total

        def __repr__(self):
            """represent current completion"""
            return str(self.count) + "/" + str(self.total)

        def render(self):
            """print self.__repr__ to stderr"""
            # "\r" overwrites the previous render, mimicking tqdm's redraw
            print(f"\r{self}", file=sys.stderr, end="")

        def update(self, i):
            """modify completion and render"""
            self.count = i
            self.render()

        def reset(self):
            """set counter to 0"""
            self.count = 0

        @staticmethod
        def close():
            """print a new empty line"""
            print("", file=sys.stderr)

    # total=None: the downloader is expected to fill in the real size
    pbar = MinimalProgressDisplay(total=None)
    download = HTTPDownloader(progressbar=pbar)
    with TemporaryDirectory() as local_store:
        fname = "large-data.txt"
        url = BASEURL + fname
        outfile = os.path.join(local_store, "large-data.txt")
        download(url, outfile, None)
        # Read stderr and make sure the progress bar is printed only when told
        captured = capsys.readouterr()
        printed = captured.err.split("\r")[-1].strip()

        # 336 bytes is the size of large-data.txt on the test server —
        # assumption based on the fixed expected string; TODO confirm
        progress = "336/336"
        assert printed == progress

        # Check that the downloaded file has the right content
        check_large_data(outfile)
|
||||
|
||||
|
||||
class TestZenodoAPISupport:
    """
    Test support for different Zenodo APIs

    Zenodo changed its REST API at some point: the legacy API lists files
    under a "key" field with "md5:"-prefixed checksums, while the new API
    uses "filename" and a bare checksum. These tests fabricate both response
    shapes and serve them from a local HTTP server (pytest-httpserver), so
    no network access is needed.
    """

    # Fake article metadata shared by all fabricated API responses below
    article_id = 123456
    doi = f"10.0001/zenodo.{article_id}"
    doi_url = f"https://doi.org/{doi}"
    file_name = "my-file.zip"
    file_url = (
        "https://zenodo.org/api/files/513d7033-93a2-4eeb-821c-2fb0bbab0012/my-file.zip"
    )
    file_checksum = "2942bfabb3d05332b66eb128e0842cff"

    # Legacy API: files use "key" and "md5:"-prefixed checksums
    legacy_api_response = {
        "created": "2021-20-19T08:00:00.000000+00:00",
        "modified": "2021-20-19T08:00:00.000000+00:00",
        "id": article_id,
        "doi": doi,
        "doi_url": doi_url,
        "files": [
            {
                "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
                "key": file_name,
                "checksum": f"md5:{file_checksum}",
                "links": {
                    "self": file_url,
                },
            }
        ],
    }

    # New API: files use "filename" and a bare checksum
    new_api_response = {
        "created": "2021-20-19T08:00:00.000000+00:00",
        "modified": "2021-20-19T08:00:00.000000+00:00",
        "id": article_id,
        "doi": doi,
        "doi_url": doi_url,
        "files": [
            {
                "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
                "filename": file_name,
                "checksum": file_checksum,
                "links": {
                    "self": file_url,
                },
            }
        ],
    }

    # Mixes legacy and new file entries in one response, which should be
    # impossible to classify as either API version
    invalid_api_response = {
        "created": "2021-20-19T08:00:00.000000+00:00",
        "modified": "2021-20-19T08:00:00.000000+00:00",
        "id": article_id,
        "doi": doi,
        "doi_url": doi_url,
        "files": [
            {
                "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
                "filename": file_name,
                "checksum": file_checksum,
                "links": {
                    "self": file_url,
                },
            },
            {
                "id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
                "key": file_name,
                "checksum": f"md5:{file_checksum}",
                "links": {
                    "self": file_url,
                },
            },
        ],
    }

    @pytest.mark.parametrize(
        "api_version, api_response",
        [
            ("legacy", legacy_api_response),
            ("new", new_api_response),
            ("invalid", invalid_api_response),
        ],
    )
    def test_api_version(self, httpserver, api_version, api_response):
        """
        Test if the API version is correctly detected.
        """
        # Create a local http server
        httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json(
            api_response
        )
        # Create Zenodo downloader
        downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url)
        # Override base url for the API of the downloader
        downloader.base_api_url = httpserver.url_for("")
        # Check if the API version is correctly identified
        if api_version != "invalid":
            assert downloader.api_version == api_version
        else:
            # The ambiguous response must raise instead of guessing
            msg = "Couldn't determine the version of the Zenodo API"
            with pytest.raises(ValueError, match=msg):
                api_version = downloader.api_version

    @pytest.mark.parametrize(
        "api_version, api_response",
        [("legacy", legacy_api_response), ("new", new_api_response)],
    )
    def test_download_url(self, httpserver, api_version, api_response):
        """
        Test if the download url is correct for each API version.
        """
        # Create a local http server
        httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json(
            api_response
        )
        # Create Zenodo downloader
        downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url)
        # Override base url for the API of the downloader
        downloader.base_api_url = httpserver.url_for("")
        # Check if the download url is correct
        download_url = downloader.download_url(file_name=self.file_name)
        if api_version == "legacy":
            # Legacy API serves files directly through the link in the
            # response
            assert download_url == self.file_url
        else:
            # New API builds a records/ URL from the article id
            expected_url = (
                "https://zenodo.org/records/"
                f"{self.article_id}/files/{self.file_name}?download=1"
            )
            assert download_url == expected_url

    @pytest.mark.parametrize(
        "api_response",
        [legacy_api_response, new_api_response],
    )
    def test_populate_registry(self, httpserver, tmp_path, api_response):
        """
        Test if population of registry is correctly done for each API version.
        """
        # Create a local http server
        httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json(
            api_response
        )
        # Create sample pooch object
        puppy = Pooch(base_url="", path=tmp_path)
        # Create Zenodo downloader
        downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url)
        # Override base url for the API of the downloader
        downloader.base_api_url = httpserver.url_for("")
        # Populate registry
        downloader.populate_registry(puppy)
        # Both API shapes must normalize to the same "md5:"-prefixed entry
        assert puppy.registry == {self.file_name: f"md5:{self.file_checksum}"}
|
||||
|
||||
|
||||
class TestDOIDownloaderHeaders:
    """Test the headers argument in DOIDownloader."""

    def test_default_headers(self):
        """Test the default value for headers."""
        # Both omitting the argument and passing None should fall back to
        # the module-level default headers.
        downloader = DOIDownloader()
        assert downloader.headers == REQUESTS_HEADERS
        downloader = DOIDownloader(headers=None)
        assert downloader.headers == REQUESTS_HEADERS

    def test_overwrite_headers(self):
        """Test overwriting for headers."""
        # A custom dict replaces the defaults entirely (no merging)
        downloader = DOIDownloader(headers={"custom": "field"})
        expected_headers = {
            "custom": "field",
        }
        assert downloader.headers == expected_headers

    def test_headers_empty_dict(self):
        """Test passing an empty dict to headers."""
        # An empty dict is kept as-is, not treated as None
        downloader = DOIDownloader(headers={})
        assert downloader.headers == {}
|
||||
@@ -0,0 +1,204 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
# pylint: disable=redefined-outer-name
|
||||
"""
|
||||
Test the hash calculation and checking functions.
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
import pytest
|
||||
|
||||
try:
|
||||
import xxhash
|
||||
|
||||
XXHASH_MAJOR_VERSION = int(xxhash.VERSION.split(".", maxsplit=1)[0])
|
||||
except ImportError:
|
||||
xxhash = None # type: ignore[assignment]
|
||||
XXHASH_MAJOR_VERSION = 0
|
||||
|
||||
from ..core import Pooch
|
||||
from ..hashes import (
|
||||
make_registry,
|
||||
file_hash,
|
||||
hash_matches,
|
||||
)
|
||||
from .utils import check_tiny_data, mirror_directory
|
||||
|
||||
# Directory with the static test data shipped alongside the test suite
DATA_DIR = str(Path(__file__).parent / "data" / "store")
# Expected registry file content for the non-recursive case
REGISTRY = (
    "tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d\n"
)
# Expected registry file content when subdirectories are included
REGISTRY_RECURSIVE = (
    "subdir/tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d\n"
    "tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d\n"
)
# Known hashes of tiny-data.txt for the algorithms hashlib provides
TINY_DATA_HASHES_HASHLIB = {
    "sha1": "c03148994acd89317915ea2f2d080d6dd127aa09",
    "sha256": "baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d",
    "md5": "70e2afd3fd7e336ae478b1e740a5f08e",
}
# Known hashes for the optional xxhash algorithms
TINY_DATA_HASHES_XXH = {
    "xxh64": "f843815fe57948fa",
    "xxh32": "98d6f1a2",
    # Require xxHash > 2.0
    "xxh128": "0267d220db258fffb0c567c0ecd1b689",
    "xxh3_128": "0267d220db258fffb0c567c0ecd1b689",
    "xxh3_64": "811e3f2a12aec53f",
}
# Combined mapping used by the parametrized tests below
TINY_DATA_HASHES = TINY_DATA_HASHES_HASHLIB.copy()
TINY_DATA_HASHES.update(TINY_DATA_HASHES_XXH)
|
||||
|
||||
|
||||
@pytest.fixture
def data_dir_mirror(tmp_path):
    """
    Mirror the test data folder on a temporary directory. Needed to avoid
    permission errors when pooch is installed on a non-writable path.

    Returns the path of the mirrored directory inside *tmp_path*.
    """
    return mirror_directory(DATA_DIR, tmp_path)
|
||||
|
||||
|
||||
def test_make_registry(data_dir_mirror):
    "Check that the registry builder creates the right file names and hashes"
    # delete=False so the file survives close() and can be reopened below
    # (required on Windows, where an open NamedTemporaryFile can't be
    # opened a second time)
    outfile = NamedTemporaryFile(delete=False)  # pylint: disable=consider-using-with
    # Need to close the file before writing to it.
    outfile.close()
    try:
        make_registry(data_dir_mirror, outfile.name, recursive=False)
        with open(outfile.name, encoding="utf-8") as fout:
            registry = fout.read()
        assert registry == REGISTRY
        # Check that the registry can be used.
        pup = Pooch(path=data_dir_mirror, base_url="some bogus URL", registry={})
        pup.load_registry(outfile.name)
        # fetch should resolve to the already-present local file without
        # ever touching the bogus URL
        true = str(data_dir_mirror / "tiny-data.txt")
        fname = pup.fetch("tiny-data.txt")
        assert true == fname
        check_tiny_data(fname)
    finally:
        # Manual cleanup because of delete=False
        os.remove(outfile.name)
|
||||
|
||||
|
||||
def test_make_registry_recursive(data_dir_mirror):
    "Check that the registry builder works in recursive mode"
    # delete=False so the file survives close() and can be reopened below
    outfile = NamedTemporaryFile(delete=False)  # pylint: disable=consider-using-with
    # Need to close the file before writing to it.
    outfile.close()
    try:
        make_registry(data_dir_mirror, outfile.name, recursive=True)
        with open(outfile.name, encoding="utf-8") as fout:
            registry = fout.read()
        assert registry == REGISTRY_RECURSIVE
        # Check that the registry can be used.
        pup = Pooch(path=data_dir_mirror, base_url="some bogus URL", registry={})
        pup.load_registry(outfile.name)
        assert str(data_dir_mirror / "tiny-data.txt") == pup.fetch("tiny-data.txt")
        check_tiny_data(pup.fetch("tiny-data.txt"))
        # Files in subdirectories must be registered with a relative
        # "subdir/" prefix and resolve to the nested path
        true = str(data_dir_mirror / "subdir" / "tiny-data.txt")
        assert true == pup.fetch("subdir/tiny-data.txt")
        check_tiny_data(pup.fetch("subdir/tiny-data.txt"))
    finally:
        # Manual cleanup because of delete=False
        os.remove(outfile.name)
|
||||
|
||||
|
||||
def test_file_hash_invalid_algorithm():
    """An unknown hashing algorithm name must raise ValueError."""
    # The message should quote the offending algorithm name.
    with pytest.raises(ValueError, match="'blah'"):
        file_hash(fname="something", alg="blah")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "alg,expected_hash",
    list(TINY_DATA_HASHES.items()),
    ids=list(TINY_DATA_HASHES.keys()),
)
def test_file_hash(alg, expected_hash):
    "Test the hash calculation using hashlib and xxhash"
    # xxhash algorithms are optional; skip rather than fail when the
    # dependency (or a new-enough version of it) is absent
    if alg.startswith("xxh"):
        if xxhash is None:
            pytest.skip("requires xxhash")
        if alg not in ["xxh64", "xxh32"] and XXHASH_MAJOR_VERSION < 2:
            pytest.skip("requires xxhash > 2.0")
    fname = os.path.join(DATA_DIR, "tiny-data.txt")
    # Sanity-check the fixture file before hashing it
    check_tiny_data(fname)
    returned_hash = file_hash(fname, alg)
    assert returned_hash == expected_hash
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "alg,expected_hash",
    list(TINY_DATA_HASHES.items()),
    ids=list(TINY_DATA_HASHES.keys()),
)
def test_hash_matches(alg, expected_hash):
    "Make sure the hash checking function works"
    # xxhash algorithms are optional; skip rather than fail when the
    # dependency (or a new-enough version of it) is absent
    if alg.startswith("xxh"):
        if xxhash is None:
            pytest.skip("requires xxhash")
        if alg not in ["xxh64", "xxh32"] and XXHASH_MAJOR_VERSION < 2:
            pytest.skip("requires xxhash > 2.0")
    fname = os.path.join(DATA_DIR, "tiny-data.txt")
    check_tiny_data(fname)
    # Check if the check passes
    # known_hash format is "algorithm:hexdigest"
    known_hash = f"{alg}:{expected_hash}"
    assert hash_matches(fname, known_hash)
    # And also if it fails
    known_hash = f"{alg}:blablablabla"
    assert not hash_matches(fname, known_hash)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "alg,expected_hash",
    list(TINY_DATA_HASHES_HASHLIB.items()),
    ids=list(TINY_DATA_HASHES_HASHLIB.keys()),
)
def test_hash_matches_strict(alg, expected_hash):
    "Make sure the hash checking function raises an exception if strict"
    fname = os.path.join(DATA_DIR, "tiny-data.txt")
    check_tiny_data(fname)
    # Check if the check passes
    known_hash = f"{alg}:{expected_hash}"
    assert hash_matches(fname, known_hash, strict=True)
    # And also if it fails
    bad_hash = f"{alg}:blablablabla"
    # With a source given, the error message should reference it...
    with pytest.raises(ValueError) as error:
        hash_matches(fname, bad_hash, strict=True, source="Neverland")
    assert "Neverland" in str(error.value)
    # ...and without one, it should fall back to the file name
    with pytest.raises(ValueError) as error:
        hash_matches(fname, bad_hash, strict=True, source=None)
    assert fname in str(error.value)
|
||||
|
||||
|
||||
def test_hash_matches_none():
    """With known_hash=None the check is skipped and always succeeds."""
    data_file = os.path.join(DATA_DIR, "tiny-data.txt")
    # A real file, a bogus file name, and strict mode: all must pass,
    # because a None hash means "don't verify".
    assert hash_matches(data_file, known_hash=None)
    assert hash_matches(fname="", known_hash=None)
    assert hash_matches(data_file, known_hash=None, strict=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "alg,expected_hash",
    list(TINY_DATA_HASHES_HASHLIB.items()),
    ids=list(TINY_DATA_HASHES_HASHLIB.keys()),
)
def test_hash_matches_uppercase(alg, expected_hash):
    "Hash matching should be independent of upper or lower case"
    fname = os.path.join(DATA_DIR, "tiny-data.txt")
    check_tiny_data(fname)
    # Check if the check passes
    # Upper-casing the hex digest must not affect the comparison
    known_hash = f"{alg}:{expected_hash.upper()}"
    assert hash_matches(fname, known_hash, strict=True)
    # And also if it fails
    # A truncated digest must still fail and mention the given source
    with pytest.raises(ValueError) as error:
        hash_matches(fname, known_hash[:-5], strict=True, source="Neverland")
    assert "Neverland" in str(error.value)
|
||||
@@ -0,0 +1,49 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
# pylint: disable=redefined-outer-name
|
||||
"""
|
||||
Test the entire process of creating a Pooch and using it.
|
||||
"""
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from .. import create, os_cache
|
||||
from .. import __version__ as full_version
|
||||
from .utils import check_tiny_data, capture_log
|
||||
|
||||
|
||||
@pytest.mark.network
def test_create_and_fetch():
    "Fetch a data file from the local storage"
    # Start from a clean cache so the "Downloading" branch is exercised
    path = os_cache("pooch-testing")
    if path.exists():
        shutil.rmtree(str(path))
    pup = create(
        path=path,
        base_url="https://github.com/fatiando/pooch/raw/{version}/data/",
        version=full_version,
        version_dev="main",
        env="POOCH_DATA_DIR",
    )
    # Make sure the storage isn't created until a download is required
    assert not pup.abspath.exists()
    pup.load_registry(Path(os.path.dirname(__file__), "data", "registry.txt"))
    for target in ["tiny-data.txt", "subdir/tiny-data.txt"]:
        # First fetch: file is absent, so the log must say "Downloading"
        with capture_log() as log_file:
            fname = pup.fetch(target)
            assert log_file.getvalue().split()[0] == "Downloading"
        check_tiny_data(fname)
        # Now modify the file to trigger an update on the next fetch
        # (note: handle is named "fin" but is opened for writing)
        with open(fname, "w", encoding="utf-8") as fin:
            fin.write("The data is now different")
        # Second fetch: hash mismatch, so the log must say "Updating"
        with capture_log() as log_file:
            fname = pup.fetch(target)
            assert log_file.getvalue().split()[0] == "Updating"
        check_tiny_data(fname)
|
||||
@@ -0,0 +1,289 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
"""
|
||||
Test the processor hooks
|
||||
"""
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
import warnings
|
||||
|
||||
import pytest
|
||||
|
||||
from .. import Pooch
|
||||
from ..processors import Unzip, Untar, Decompress
|
||||
|
||||
from .utils import pooch_test_url, pooch_test_registry, check_tiny_data, capture_log
|
||||
|
||||
|
||||
# File-name/hash registry and base URL of the hosted test data used by
# every processor test in this module
REGISTRY = pooch_test_registry()
BASEURL = pooch_test_url()
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.parametrize(
    "method,ext,name",
    [
        ("auto", "xz", None),
        ("lzma", "xz", None),
        ("xz", "xz", None),
        ("bzip2", "bz2", None),
        ("gzip", "gz", None),
        # name overrides the default "<fname>.decomp" output file name
        ("gzip", "gz", "different-name.txt"),
    ],
    ids=["auto", "lzma", "xz", "bz2", "gz", "name"],
)
def test_decompress(method, ext, name):
    "Check that decompression after download works for all formats"
    processor = Decompress(method=method, name=name)
    with TemporaryDirectory() as local_store:
        path = Path(local_store)
        if name is None:
            # Default output name: "<original>.<ext>.decomp"
            true_path = str(path / ".".join(["tiny-data.txt", ext, "decomp"]))
        else:
            true_path = str(path / name)
        # Setup a pooch in a temp dir
        pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)
        # Check the logs when downloading and from the processor
        with capture_log() as log_file:
            fname = pup.fetch("tiny-data.txt." + ext, processor=processor)
            logs = log_file.getvalue()
            lines = logs.splitlines()
            # Exactly one download line plus one decompression line
            assert len(lines) == 2
            assert lines[0].split()[0] == "Downloading"
            assert lines[-1].startswith("Decompressing")
            assert method in lines[-1]
        assert fname == true_path
        check_tiny_data(fname)
        # Check that processor doesn't execute when not downloading
        with capture_log() as log_file:
            fname = pup.fetch("tiny-data.txt." + ext, processor=processor)
            # No log output at all: neither download nor decompression ran
            assert log_file.getvalue() == ""
        assert fname == true_path
        check_tiny_data(fname)
|
||||
|
||||
|
||||
@pytest.mark.network
def test_decompress_fails():
    "Should fail if method='auto' and no extension is given in the file name"
    with TemporaryDirectory() as local_store:
        path = Path(local_store)
        pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)
        # Invalid extension
        # catch_warnings keeps any emitted warnings from leaking into the
        # test session
        with pytest.raises(ValueError) as exception:
            with warnings.catch_warnings():
                pup.fetch("tiny-data.txt", processor=Decompress(method="auto"))
        assert exception.value.args[0].startswith("Unrecognized file extension '.txt'")
        # A plain .txt should not trigger the archive-specific hint
        assert "pooch.Unzip/Untar" not in exception.value.args[0]
        # Should also fail for a bad method name
        with pytest.raises(ValueError) as exception:
            with warnings.catch_warnings():
                pup.fetch("tiny-data.txt", processor=Decompress(method="bla"))
        assert exception.value.args[0].startswith("Invalid compression method 'bla'")
        assert "pooch.Unzip/Untar" not in exception.value.args[0]
        # Point people to Untar and Unzip
        # Archive formats are not Decompress's job: the error must point
        # users to the Unzip/Untar processors instead.
        with pytest.raises(ValueError) as exception:
            with warnings.catch_warnings():
                pup.fetch("tiny-data.txt", processor=Decompress(method="zip"))
        assert exception.value.args[0].startswith("Invalid compression method 'zip'")
        assert "pooch.Unzip/Untar" in exception.value.args[0]
        with pytest.raises(ValueError) as exception:
            with warnings.catch_warnings():
                pup.fetch("store.zip", processor=Decompress(method="auto"))
        assert exception.value.args[0].startswith("Unrecognized file extension '.zip'")
        assert "pooch.Unzip/Untar" in exception.value.args[0]
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.parametrize(
    "target_path", [None, "some_custom_path"], ids=["default_path", "custom_path"]
)
@pytest.mark.parametrize(
    # members=None extracts everything; otherwise only the listed
    # files/subdirectories are extracted
    "archive,members",
    [
        ("tiny-data", ["tiny-data.txt"]),
        ("store", None),
        ("store", ["store/tiny-data.txt"]),
        ("store", ["store/subdir/tiny-data.txt"]),
        ("store", ["store/subdir"]),
        ("store", ["store/tiny-data.txt", "store/subdir"]),
    ],
    ids=[
        "single_file",
        "archive_all",
        "archive_file",
        "archive_subdir_file",
        "archive_subdir",
        "archive_multiple",
    ],
)
@pytest.mark.parametrize(
    "processor_class,extension",
    [(Unzip, ".zip"), (Untar, ".tar.gz")],
    ids=["Unzip", "Untar"],
)
def test_unpacking(processor_class, extension, target_path, archive, members):
    "Tests the behaviour of processors for unpacking archives (Untar, Unzip)"
    processor = processor_class(members=members, extract_dir=target_path)
    if target_path is None:
        # Default extraction dir is "<archive><ext><suffix>" next to the
        # downloaded archive (suffix is ".unzip"/".untar")
        target_path = archive + extension + processor.suffix
    with TemporaryDirectory() as path:
        path = Path(path)
        # Helper (defined later in this module) computes the expected
        # extracted paths and the expected log lines for this combination
        true_paths, expected_log = _unpacking_expected_paths_and_logs(
            archive, members, path / target_path, processor_class.__name__
        )
        # Setup a pooch in a temp dir
        pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)
        # Capture logs and check for the right processor message
        with capture_log() as log_file:
            fnames = pup.fetch(archive + extension, processor=processor)
            assert set(fnames) == true_paths
            _check_logs(log_file, expected_log)
        for fname in fnames:
            check_tiny_data(fname)
        # Check that processor doesn't execute when not downloading
        with capture_log() as log_file:
            fnames = pup.fetch(archive + extension, processor=processor)
            assert set(fnames) == true_paths
            # Empty expected log: nothing should be downloaded or unpacked
            _check_logs(log_file, [])
        for fname in fnames:
            check_tiny_data(fname)
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.parametrize(
    "processor_class,extension",
    [(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_multiple_unpacking(processor_class, extension):
    "Test that multiple subsequent calls to a processor yield correct results"
    with TemporaryDirectory() as local_store:
        pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)
        archive_name = "store" + extension

        # First fetch: extract only a single member
        one_member = processor_class(members=["store/tiny-data.txt"])
        extracted = pup.fetch(archive_name, processor=one_member)
        assert len(extracted) == 1
        check_tiny_data(extracted[0])

        # Second fetch: extract two members from the now-cached archive
        two_members = processor_class(
            members=["store/tiny-data.txt", "store/subdir/tiny-data.txt"]
        )
        extracted = pup.fetch(archive_name, processor=two_members)
        assert len(extracted) == 2
        for data_file in extracted:
            check_tiny_data(data_file)

        # Third fetch: back to a single member; only that member is returned
        extracted = pup.fetch(archive_name, processor=one_member)
        assert len(extracted) == 1
        check_tiny_data(extracted[0])
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.parametrize(
    "processor_class,extension",
    [(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_unpack_members_with_leading_dot(processor_class, extension):
    "Test that unpack members can also be specified with a leading ./"
    with TemporaryDirectory() as local_store:
        pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)
        # Member paths with a "./" prefix should still match archive entries
        processor = processor_class(members=["./store/tiny-data.txt"])
        extracted = pup.fetch("store" + extension, processor=processor)
        assert len(extracted) == 1
        check_tiny_data(extracted[0])
||||
|
||||
|
||||
def _check_logs(log_file, expected_lines):
|
||||
"""
|
||||
Assert that the lines in the log match the expected ones.
|
||||
"""
|
||||
lines = log_file.getvalue().splitlines()
|
||||
assert len(lines) == len(expected_lines)
|
||||
for line, expected_line in zip(lines, expected_lines):
|
||||
assert line.startswith(expected_line)
|
||||
|
||||
|
||||
def _unpacking_expected_paths_and_logs(archive, members, path, name):
|
||||
"""
|
||||
Generate the appropriate expected paths and log message depending on the
|
||||
parameters for the test.
|
||||
"""
|
||||
log_lines = ["Downloading"]
|
||||
if archive == "tiny-data":
|
||||
true_paths = {str(path / "tiny-data.txt")}
|
||||
log_lines.append("Extracting 'tiny-data.txt'")
|
||||
elif archive == "store" and members is None:
|
||||
true_paths = {
|
||||
str(path / "store" / "tiny-data.txt"),
|
||||
str(path / "store" / "subdir" / "tiny-data.txt"),
|
||||
}
|
||||
log_lines.append(f"{name}{name[-1]}ing contents")
|
||||
elif archive == "store" and members is not None:
|
||||
true_paths = []
|
||||
for member in members:
|
||||
true_path = path / Path(*member.split("/"))
|
||||
if not str(true_path).endswith("tiny-data.txt"):
|
||||
true_path = true_path / "tiny-data.txt"
|
||||
true_paths.append(str(true_path))
|
||||
log_lines.append(f"Extracting '{member}'")
|
||||
true_paths = set(true_paths)
|
||||
return true_paths, log_lines
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.parametrize(
    "processor_class,extension",
    [(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_unpacking_members_then_no_members(processor_class, extension):
    """
    Test that calling with valid members then without them works.
    https://github.com/fatiando/pooch/issues/364
    """
    with TemporaryDirectory() as local_store:
        pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)
        archive_name = "store" + extension
        # Extract a single existing member first
        only_one = pup.fetch(
            archive_name, processor=processor_class(members=["store/tiny-data.txt"])
        )
        assert len(only_one) == 1
        # Fetching with no members afterwards must unpack everything
        everything = pup.fetch(archive_name, processor=processor_class())
        assert len(everything) > 1
|
||||
|
||||
|
||||
@pytest.mark.network
@pytest.mark.parametrize(
    "processor_class,extension",
    [(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_unpacking_wrong_members_then_no_members(processor_class, extension):
    """
    Test that calling with invalid members then without them works.
    https://github.com/fatiando/pooch/issues/364
    """
    with TemporaryDirectory() as local_store:
        pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)
        archive_name = "store" + extension
        # A member that doesn't exist in the archive yields no files
        missing = pup.fetch(
            archive_name, processor=processor_class(members=["not-a-valid-file.csv"])
        )
        assert len(missing) == 0
        # Fetching with no members afterwards must still unpack the archive
        everything = pup.fetch(archive_name, processor=processor_class())
        assert len(everything) > 0
|
||||
@@ -0,0 +1,197 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
"""
|
||||
Test the utility functions.
|
||||
"""
|
||||
import os
|
||||
import shutil
|
||||
import time
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
from tempfile import TemporaryDirectory
|
||||
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
|
||||
|
||||
import pytest
|
||||
|
||||
from ..utils import (
|
||||
parse_url,
|
||||
make_local_storage,
|
||||
temporary_file,
|
||||
unique_file_name,
|
||||
)
|
||||
|
||||
|
||||
def test_unique_name_long():
    "The file name should never be longer than 255 characters"
    long_url = "https://www.something.com/data" + "a" * 500 + ".txt"
    assert len(long_url) > 255
    fname = unique_file_name(long_url)
    # Truncated to the common file-system limit, keeping the extension
    assert len(fname) == 255
    assert fname.endswith("aaaaaa.txt")
    assert fname.split("-")[1][:10] == "aaaaaaaaaa"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "pool",
    [ThreadPoolExecutor, ProcessPoolExecutor],
    ids=["threads", "processes"],
)
def test_make_local_storage_parallel(pool, monkeypatch):
    "Try to create the cache folder in parallel"
    # Multiple simultaneous attempts at creating the same folder must not
    # raise. See https://github.com/fatiando/pooch/issues/170

    # Keep a reference to the real makedirs so the patched version can call
    # it without recursing into itself.
    real_makedirs = os.makedirs

    def slow_makedirs(path, exist_ok=False):  # pylint: disable=unused-argument
        "Delay before calling makedirs"
        # The sleep widens the race window; otherwise the first worker would
        # create the directory before the others even start.
        time.sleep(1.5)
        real_makedirs(path, exist_ok=exist_ok)

    monkeypatch.setattr(os, "makedirs", slow_makedirs)

    data_cache = os.path.join(os.curdir, "test_parallel_cache")
    assert not os.path.exists(data_cache)

    try:
        with pool() as executor:
            futures = [
                executor.submit(make_local_storage, data_cache) for _ in range(4)
            ]
            for future in futures:
                future.result()
            assert os.path.exists(data_cache)
    finally:
        # Always clean up the folder created in the working directory
        if os.path.exists(data_cache):
            shutil.rmtree(data_cache)
|
||||
|
||||
|
||||
def test_local_storage_makedirs_permissionerror(monkeypatch):
    "Should warn the user when can't create the local data dir"

    def fail_makedirs(path, exist_ok=False):  # pylint: disable=unused-argument
        "Raise an exception to mimic permission issues"
        raise PermissionError("Fake error")

    data_cache = os.path.join(os.curdir, "test_permission")
    assert not os.path.exists(data_cache)

    monkeypatch.setattr(os, "makedirs", fail_makedirs)

    with pytest.raises(PermissionError) as error:
        make_local_storage(
            path=data_cache,
            env="SOME_VARIABLE",
        )
    # The message should explain the failure and point at the environment
    # variable that lets the user pick another location
    assert "Pooch could not create data cache" in str(error)
    assert "'SOME_VARIABLE'" in str(error)
|
||||
|
||||
|
||||
def test_local_storage_newfile_permissionerror(monkeypatch):
    "Should warn the user when can't write to the local data dir"
    # Separate from the makedirs test: here the data dir already exists but
    # writing a file into it fails.

    def fail_tempfile(**kwargs):  # pylint: disable=unused-argument
        "Raise an exception to mimic permission issues"
        raise PermissionError("Fake error")

    with TemporaryDirectory() as data_cache:
        os.makedirs(os.path.join(data_cache, "1.0"))
        assert os.path.exists(data_cache)

        monkeypatch.setattr(tempfile, "NamedTemporaryFile", fail_tempfile)

        with pytest.raises(PermissionError) as error:
            make_local_storage(
                path=data_cache,
                env="SOME_VARIABLE",
            )
        assert "Pooch could not write to data cache" in str(error)
        assert "'SOME_VARIABLE'" in str(error)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "url,output",
    [
        (
            "http://127.0.0.1:8080/test.nc",
            {"protocol": "http", "netloc": "127.0.0.1:8080", "path": "/test.nc"},
        ),
        (
            "ftp://127.0.0.1:8080/test.nc",
            {"protocol": "ftp", "netloc": "127.0.0.1:8080", "path": "/test.nc"},
        ),
        (
            "doi:10.6084/m9.figshare.923450.v1/dike.json",
            {
                "protocol": "doi",
                "netloc": "10.6084/m9.figshare.923450.v1",
                "path": "/dike.json",
            },
        ),
        (
            r"doi:10.5281/zenodo.7632643/santisoler/pooch-test-data-v1.zip",
            {
                "protocol": "doi",
                "netloc": "10.5281/zenodo.7632643",
                "path": "/santisoler/pooch-test-data-v1.zip",
            },
        ),
    ],
    ids=["http", "ftp", "doi", "zenodo-doi-with-slash"],
)
def test_parse_url(url, output):
    "Parse URL into 3 components"
    parsed = parse_url(url)
    assert parsed == output
|
||||
|
||||
|
||||
def test_parse_url_invalid_doi():
    "Should fail if we forget to not include // in the DOI link"
    # "doi://" (with slashes) is not a valid DOI prefix; only "doi:" is
    with pytest.raises(ValueError):
        parse_url("doi://XXX/XXX/fname.txt")
|
||||
|
||||
|
||||
def test_temporary_file():
    "Make sure the file is writable and cleaned up in the end"
    with temporary_file() as fname:
        tmp = Path(fname)
        assert tmp.exists()
        # Round-trip some content through the file
        tmp.write_text("Meh", encoding="utf-8")
        assert tmp.read_text(encoding="utf-8").strip() == "Meh"
    # Exiting the context manager deletes the file
    assert not tmp.exists()
|
||||
|
||||
|
||||
def test_temporary_file_path():
    "Make sure the file is writable and cleaned up in the end when given a dir"
    with TemporaryDirectory() as parent:
        with temporary_file(parent) as fname:
            tmp = Path(fname)
            assert tmp.exists()
            # The file must be created inside the requested directory
            assert parent in fname
            tmp.write_text("Meh", encoding="utf-8")
            assert tmp.read_text(encoding="utf-8").strip() == "Meh"
        assert not tmp.exists()
|
||||
|
||||
|
||||
def test_temporary_file_exception():
    "Make sure the file is writable and cleaned up when there is an exception"
    # Use pytest.raises instead of a bare try/except: with try/except the
    # test would silently pass if the ValueError were never raised, so the
    # cleanup assertion would never run.
    with pytest.raises(ValueError, match="Nooooooooo!"):
        with temporary_file() as tmp:
            assert Path(tmp).exists()
            raise ValueError("Nooooooooo!")
    # The context manager must have cleaned up despite the exception
    assert not Path(tmp).exists()
|
||||
@@ -0,0 +1,19 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
"""
|
||||
Test the version.
|
||||
"""
|
||||
from packaging.version import Version
|
||||
|
||||
import pooch
|
||||
|
||||
|
||||
def test_version():
    "Check there's a usable version number in the usual __version__"
    version = pooch.__version__
    assert version.startswith("v")
    # Raises InvalidVersion if the string isn't PEP440 compliant
    Version(version)
|
||||
@@ -0,0 +1,237 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
"""
|
||||
Utilities for testing code.
|
||||
"""
|
||||
import os
|
||||
import io
|
||||
import logging
|
||||
import shutil
|
||||
import stat
|
||||
from pathlib import Path
|
||||
from contextlib import contextmanager
|
||||
|
||||
from .. import __version__ as full_version
|
||||
from ..utils import check_version, get_logger
|
||||
|
||||
|
||||
def check_tiny_data(fname):
    """
    Load the tiny-data.txt file and check that the contents are correct.
    """
    assert os.path.exists(fname)
    expected = "# A tiny data file for test purposes only\n1 2 3 4 5 6"
    with open(fname, encoding="utf-8") as tinydata:
        assert tinydata.read().strip() == expected
|
||||
|
||||
|
||||
def check_large_data(fname):
    """
    Load the large-data.txt file and check that the contents are correct.
    """
    assert os.path.exists(fname)
    # Header line followed by 6002 identical data rows. The "larer" typo is
    # deliberate: it matches the contents of the actual data file.
    expected_lines = ["# A larer data file for test purposes only"]
    expected_lines.extend(["1 2 3 4 5 6"] * 6002)
    with open(fname, encoding="utf-8") as data:
        assert data.read().strip() == "\n".join(expected_lines)
|
||||
|
||||
|
||||
def pooch_test_url():
    """
    Get the base URL for the test data used in Pooch itself.

    The URL is a GitHub raw link to the ``pooch/tests/data`` directory from the
    `GitHub repository <https://github.com/fatiando/pooch>`__. It matches the
    pooch version specified in ``pooch.version.full_version``.

    Returns
    -------
    url
        The versioned URL for pooch's test data.

    """
    # Unreleased versions fall back to the "main" branch instead of a tag
    version = check_version(full_version, fallback="main")
    return f"https://github.com/fatiando/pooch/raw/{version}/pooch/tests/data/"
|
||||
|
||||
|
||||
def pooch_test_figshare_url():
    """
    Get the base URL for the test data stored in figshare.

    The URL contains the DOI for the figshare dataset using the appropriate
    version for this version of Pooch.

    Returns
    -------
    url
        The URL for pooch's test data.

    """
    return "doi:10.6084/m9.figshare.14763051.v1/"
|
||||
|
||||
|
||||
def pooch_test_zenodo_url():
    """
    Get the base URL for the test data stored in Zenodo.

    The URL contains the DOI for the Zenodo dataset using the appropriate
    version for this version of Pooch.

    Returns
    -------
    url
        The URL for pooch's test data.

    """
    return "doi:10.5281/zenodo.4924875/"
|
||||
|
||||
|
||||
def pooch_test_zenodo_with_slash_url():
    """
    Get base URL for test data in Zenodo, where the file name contains a slash.

    The URL contains the DOI for the Zenodo dataset that has a slash in the
    filename (created with the GitHub-Zenodo integration service), using the
    appropriate version for this version of Pooch.

    Returns
    -------
    url
        The URL for pooch's test data.

    """
    return "doi:10.5281/zenodo.7632643/"
|
||||
|
||||
|
||||
def pooch_test_dataverse_url():
    """
    Get the base URL for the test data stored on a DataVerse instance.

    Returns
    -------
    url
        The URL for pooch's test data.
    """
    return "doi:10.11588/data/TKCFEF/"
|
||||
|
||||
|
||||
def pooch_test_registry():
    """
    Get a registry for the test data used in Pooch itself.

    Returns
    -------
    registry
        Dictionary with pooch's test data files and their SHA-256 hashes.

    """
    # File names (relative to the data folder) mapped to their SHA-256 hashes
    return {
        "tiny-data.txt": "baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d",
        "large-data.txt": "98de171fb320da82982e6bf0f3994189fff4b42b23328769afce12bdd340444a",
        "subdir/tiny-data.txt": "baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d",
        "tiny-data.zip": "0d49e94f07bc1866ec57e7fd1b93a351fba36842ec9b13dd50bf94e8dfa35cbb",
        "store.zip": "0498d2a001e71051bbd2acd2346f38da7cbd345a633cb7bf0f8a20938714b51a",
        "tiny-data.tar.gz": "41503f083814f43a01a8e9a30c28d7a9fe96839a99727a7fdd0acf7cd5bab63b",
        "store.tar.gz": "088c7f4e0f1859b1c769bb6065de24376f366374817ede8691a6ac2e49f29511",
        "tiny-data.txt.bz2": "753663687a4040c90c8578061867d1df623e6aa8011c870a5dbd88ee3c82e306",
        "tiny-data.txt.gz": "2e2da6161291657617c32192dba95635706af80c6e7335750812907b58fd4b52",
        "tiny-data.txt.xz": "99dcb5c32a6e916344bacb4badcbc2f2b6ee196977d1d8187610c21e7e607765",
    }
|
||||
|
||||
|
||||
@contextmanager
def capture_log(level=logging.DEBUG):
    """
    Create a context manager for reading from the logs.

    Parameters
    ----------
    level : int
        Minimum logging level captured by the temporary handler.

    Yields
    ------
    log_file : StringIO
        a file-like object to which the logs were written
    """
    log_file = io.StringIO()
    handler = logging.StreamHandler(log_file)
    handler.setLevel(level)
    get_logger().addHandler(handler)
    # try/finally guarantees the handler is removed even when the body of
    # the "with" block raises (e.g. a failing assertion in a test).
    # Without it, handlers would pile up on the shared logger and duplicate
    # every subsequent log message.
    try:
        yield log_file
    finally:
        get_logger().removeHandler(handler)
|
||||
|
||||
|
||||
@contextmanager
def data_over_ftp(server, fname):
    """
    Add a test data file to the test FTP server and clean it up afterwards.

    Parameters
    ----------
    server
        The ``ftpserver`` fixture provided by pytest-localftpserver.
    fname : str
        The name of a file *relative* to the test data folder of the package
        (usually just the file name, not the full path).

    Yields
    ------
    url : str
        The download URL of the data file from the test FTP server.

    """
    source = str(Path(__file__).parent / "data" / fname)
    destination = os.path.join(server.anon_root, fname)
    try:
        shutil.copyfile(source, destination)
        yield f"ftp://localhost/{fname}"
    finally:
        # Remove the file from the server even if the test failed
        if os.path.exists(destination):
            os.remove(destination)
|
||||
|
||||
|
||||
def _recursive_chmod_directories(root, mode):
|
||||
"""
|
||||
Recursively change the permissions on the child directories using a bitwise
|
||||
OR operation.
|
||||
"""
|
||||
for item in root.iterdir():
|
||||
if item.is_dir():
|
||||
item.chmod(item.stat().st_mode | mode)
|
||||
_recursive_chmod_directories(item, mode)
|
||||
|
||||
|
||||
def mirror_directory(source, destination):
    """
    Copy contents of the source directory into destination and fix permissions.

    Parameters
    ----------
    source : str, :class:`pathlib.Path`
        Source data directory.
    destination : str, :class:`pathlib.Path`
        Destination directory that will contain the copy of source. The actual
        source directory (not just its contents) is copied.

    Returns
    -------
    mirror : :class:`pathlib.Path`
        The path of the mirrored output directory.

    """
    source_path = Path(source)
    mirror = Path(destination) / source_path.name
    shutil.copytree(source_path, mirror)
    # Make sure the user can write to every copied directory
    _recursive_chmod_directories(mirror, mode=stat.S_IWUSR)
    return mirror
|
||||
@@ -0,0 +1,72 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
"""
|
||||
Custom classes for type annotations
|
||||
|
||||
This module provides additional `PEP 484 <https://peps.python.org/pep-0484/>`_
|
||||
type aliases used in ``pooch``'s codebase.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Literal,
|
||||
Optional,
|
||||
Protocol,
|
||||
TypedDict,
|
||||
Union,
|
||||
)
|
||||
|
||||
# Import Pooch only if TYPE_CHECKING is true to avoid circular loops at runtime
|
||||
if TYPE_CHECKING:
|
||||
from .. import Pooch
|
||||
|
||||
|
||||
# Public names exported by this typing module
__all__ = [
    "Action",
    "Downloader",
    "PathType",
    "PathInputType",
    "ParsedURL",
    "Processor",
]


# The action pooch is performing on a file: a fresh "download", a cached
# "fetch", or an "update" of a stale file.
Action = Literal["download", "fetch", "update"]
# A single filesystem path, given as a string or any os.PathLike.
PathType = Union[str, os.PathLike]
# A path given either directly or as a list/tuple of parts to be joined.
PathInputType = Union[PathType, list[PathType], tuple[PathType]]
# A post-download processor: called with the file name, the action, and the
# Pooch instance (or None); its return value is passed back to the caller.
Processor = Callable[[str, Action, Optional["Pooch"]], Any]
|
||||
|
||||
|
||||
class Downloader(Protocol):
    """
    Class used to define the type definition for the downloader function.

    Any callable matching this signature can be passed as the ``downloader``
    argument of fetch/retrieve-style functions (structural typing).
    """

    # pylint: disable=too-few-public-methods
    def __call__(  # noqa: E704
        self,
        fname: str,
        # NOTE(review): annotated as Optional[PathType] here, but the name
        # "action" suggests an Action literal — confirm the intended type.
        action: Optional[PathType],
        pooch: Optional["Pooch"],
        *,
        check_only: Optional[bool] = None,
    ) -> Any: ...
|
||||
|
||||
|
||||
class ParsedURL(TypedDict):
    """
    Type for a dictionary generated after parsing a URL.

    The dictionary contains three keys: protocol, netloc and path.
    This is the return type of ``pooch.utils.parse_url``.
    """

    protocol: str  # scheme, e.g. "http", "ftp", "doi", or "file" when absent
    netloc: str  # host[:port] (or the DOI itself for "doi:" URLs)
    path: str  # path component, starting with "/"
|
||||
Binary file not shown.
356
linedance-app/venv/lib/python3.12/site-packages/pooch/utils.py
Normal file
356
linedance-app/venv/lib/python3.12/site-packages/pooch/utils.py
Normal file
@@ -0,0 +1,356 @@
|
||||
# Copyright (c) 2018 The Pooch Developers.
|
||||
# Distributed under the terms of the BSD 3-Clause License.
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
#
|
||||
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
|
||||
#
|
||||
"""
|
||||
Misc utilities
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlsplit
|
||||
from contextlib import contextmanager
|
||||
import warnings
|
||||
from typing import Optional, Any, Generator
|
||||
|
||||
import platformdirs
|
||||
from packaging.version import Version
|
||||
|
||||
from .typing import ParsedURL, PathType, PathInputType
|
||||
|
||||
|
||||
# Module-level logger shared by all of pooch; retrieve it via get_logger().
# NOTE(review): instantiated directly rather than with logging.getLogger() —
# presumably to keep it detached from the root logger hierarchy; confirm.
LOGGER = logging.Logger("pooch")
LOGGER.addHandler(logging.StreamHandler())
|
||||
|
||||
|
||||
def file_hash(*args, **kwargs) -> Any:
    """
    WARNING: Importing this function from pooch.utils is DEPRECATED.
    Please import from the top-level namespace (`from pooch import file_hash`)
    instead, which is fully backwards compatible with pooch >= 0.1.

    Examples
    --------

    >>> fname = "test-file-for-hash.txt"
    >>> with open(fname, "w") as f:
    ...     __ = f.write("content of the file")
    >>> print(file_hash(fname))
    0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
    >>> import os
    >>> os.remove(fname)

    """
    # pylint: disable=import-outside-toplevel
    # Deferred import — presumably to avoid a circular import between
    # utils and hashes; confirm before moving to the top of the module.
    from .hashes import file_hash as new_file_hash

    message = """
    Importing file_hash from pooch.utils is DEPRECATED. Please import from the
    top-level namespace (`from pooch import file_hash`) instead, which is fully
    backwards compatible with pooch >= 0.1.
    """
    # stacklevel=2 attributes the warning to the caller, not this wrapper
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return new_file_hash(*args, **kwargs)
|
||||
|
||||
|
||||
def get_logger() -> logging.Logger:
    r"""
    Get the default event logger.

    The logger records events like downloading files, unzipping archives, etc.
    Use the method :meth:`logging.Logger.setLevel` of this object to adjust the
    verbosity level from Pooch.

    Returns
    -------
    logger : :class:`logging.Logger`
        The logger object for Pooch
    """
    # All of pooch shares the single module-level LOGGER instance
    return LOGGER
|
||||
|
||||
|
||||
def os_cache(project: str) -> Path:
    r"""
    Default cache location based on the operating system.

    The folder locations are defined by the ``platformdirs`` package
    using the ``user_cache_dir`` function.
    Usually, the locations will be following (see the
    `platformdirs documentation <https://platformdirs.readthedocs.io>`__):

    * Mac: ``~/Library/Caches/<AppName>``
    * Unix: ``~/.cache/<AppName>`` or the value of the ``XDG_CACHE_HOME``
      environment variable, if defined.
    * Windows: ``C:\Users\<user>\AppData\Local\<AppAuthor>\<AppName>\Cache``

    Parameters
    ----------
    project : str
        The project name.

    Returns
    -------
    cache_path : :class:`pathlib.Path`
        The default location for the data cache. User directories (``'~'``) are
        not expanded.

    """
    cache_dir = platformdirs.user_cache_dir(project)
    return Path(cache_dir)
|
||||
|
||||
|
||||
def check_version(version: str, fallback: str = "master") -> str:
    """
    Check if a version is PEP440 compliant and there are no unreleased changes.

    For example, ``version = "0.1"`` will be returned as is but ``version =
    "0.1+10.8dl8dh9"`` will return the fallback. This is the convention used by
    `versioneer <https://github.com/warner/python-versioneer>`__ to mark that
    this version is 10 commits ahead of the last release.

    Parameters
    ----------
    version : str
        A version string.
    fallback : str
        What to return if the version string has unreleased changes.

    Returns
    -------
    version : str
        If *version* is PEP440 compliant and there are unreleased changes, then
        return *version*. Otherwise, return *fallback*.

    Raises
    ------
    InvalidVersion
        If *version* is not PEP440 compliant.

    Examples
    --------

    >>> check_version("0.1")
    '0.1'
    >>> check_version("0.1a10")
    '0.1a10'
    >>> check_version("0.1+111.9hdg36")
    'master'
    >>> check_version("0.1+111.9hdg36", fallback="dev")
    'dev'

    """
    # A non-None "local" segment (the "+..." part) marks unreleased changes
    has_local_changes = Version(version).local is not None
    return fallback if has_local_changes else version
|
||||
|
||||
|
||||
def parse_url(url: str) -> ParsedURL:
    """
    Parse a URL into 3 components:

    <protocol>://<netloc>/<path>

    Example URLs:

    * http://127.0.0.1:8080/test.nc
    * ftp://127.0.0.1:8080/test.nc
    * doi:10.6084/m9.figshare.923450.v1/test.nc

    The DOI is a special case. The protocol will be "doi", the netloc will be
    the DOI, and the path is what comes after the last "/".
    The only exception are Zenodo dois: the protocol will be "doi", the netloc
    will be composed by the "prefix/suffix" and the path is what comes after
    the second "/". This allows to support special cases of Zenodo dois where
    the path contains forward slashes "/", created by the GitHub-Zenodo
    integration service.

    Parameters
    ----------
    url : str
        The URL.

    Returns
    -------
    parsed_url : dict
        Three components of a URL (e.g.,
        ``{'protocol':'http', 'netloc':'127.0.0.1:8080','path': '/test.nc'}``).

    """
    if url.startswith("doi://"):
        raise ValueError(
            f"Invalid DOI link '{url}'. You must not use '//' after 'doi:'."
        )
    if not url.startswith("doi:"):
        # Regular URL: delegate to the standard library
        parts = urlsplit(url)
        return {
            "protocol": parts.scheme or "file",
            "netloc": parts.netloc,
            "path": parts.path,
        }
    # DOI: the netloc is the DOI itself. Zenodo DOIs made by the
    # GitHub-Zenodo integration can have a "/" in the file name, so for
    # them only the "prefix/suffix" pair forms the netloc.
    segments = url[len("doi:"):].split("/")
    if "zenodo" in segments[1].lower():
        split_at = 2
    else:
        split_at = len(segments) - 1
    return {
        "protocol": "doi",
        "netloc": "/".join(segments[:split_at]),
        "path": "/" + "/".join(segments[split_at:]),
    }
|
||||
|
||||
|
||||
def cache_location(
    path: PathInputType, env: Optional[str] = None, version: Optional[str] = None
) -> Path:
    """
    Location of the cache given a base path and optional configuration.

    Checks for the environment variable to overwrite the path of the local
    cache. Optionally add *version* to the path if given.

    Parameters
    ----------
    path : str, PathLike, list or tuple
        The path to the local data storage folder. If this is a list or tuple,
        we'll join the parts with the appropriate separator. Use
        :func:`pooch.os_cache` for a sensible default.
    version : str or None
        The version string for your project. Will be appended to given path if
        not None.
    env : str or None
        An environment variable that can be used to overwrite *path*. This
        allows users to control where they want the data to be stored. We'll
        append *version* to the end of this value as well.

    Returns
    -------
    local_path : PathLike
        The path to the local directory.

    """
    # The environment variable overrides *path* only when set and non-empty
    if env is not None and os.environ.get(env):
        path = os.environ[env]
    # Multi-part paths get joined with the OS separator
    if isinstance(path, (list, tuple)):
        path = os.path.join(*path)
    if version is not None:
        path = os.path.join(str(path), version)
    return Path(os.path.expanduser(str(path)))
|
||||
|
||||
|
||||
def make_local_storage(path: PathType, env: Optional[str] = None) -> None:
    """
    Create the local cache directory and verify that it is writable.

    Parameters
    ----------
    path : str or PathLike
        The path to the local data storage folder.
    env : str or None
        An environment variable that can be used to overwrite *path*. Only
        used in the error message in case the folder is not writable.
    """
    path = str(path)
    # Decide up front which operation will be attempted so that a failure
    # can be reported accurately.
    action = "write to" if os.path.exists(path) else "create"
    try:
        if action == "create":
            # When running in parallel, multiple jobs may race to create
            # the path; exist_ok avoids an error for the losers.
            os.makedirs(path, exist_ok=True)
        else:
            # Probe writability by creating (and discarding) a temp file.
            with tempfile.NamedTemporaryFile(dir=path):
                pass
    except PermissionError as error:
        parts = [
            str(error),
            f"| Pooch could not {action} data cache folder '{path}'.",
            "Will not be able to download data files.",
        ]
        if env is not None:
            parts.append(
                f"Use environment variable '{env}' to specify a different location."
            )
        raise PermissionError(" ".join(parts)) from error
|
||||
|
||||
|
||||
@contextmanager
def temporary_file(path: Optional[PathType] = None) -> Generator[str, None, None]:
    """
    Yield the name of a closed temporary file and clean it up afterwards.

    On Windows, :class:`tempfile.NamedTemporaryFile` cannot be opened a
    second time while still open (for example, when passing its name to a
    Pooch function). This context manager creates the file, closes it
    immediately, yields its path, and guarantees the file is deleted on
    exit.

    Parameters
    ----------
    path : str or PathLike
        The directory in which the temporary file will be created.

    Yields
    ------
    fname : str
        The path to the temporary file.

    """
    handle = tempfile.NamedTemporaryFile(delete=False, dir=path)  # type: ignore
    # Close right away so the caller can reopen the file by name.
    handle.close()
    try:
        yield handle.name
    finally:
        if os.path.exists(handle.name):
            os.remove(handle.name)
|
||||
|
||||
|
||||
def unique_file_name(url: str) -> str:
    """
    Create a unique file name based on the given URL.

    The file name will be unique to the URL by prepending the name with the MD5
    hash (hex digest) of the URL. The name will also include the last portion
    of the URL.

    The format will be: ``{md5}-{fname}.{ext}``

    The file name will be cropped so that the entire name (including the hash)
    is less than 255 characters long (the limit on most file systems).

    Parameters
    ----------
    url : str
        The URL with a file name at the end.

    Returns
    -------
    fname : str
        The file name, unique to this URL.

    Examples
    --------

    >>> print(unique_file_name("https://www.some-server.org/2020/data.txt"))
    02ddee027ce5ebb3d7059fb23d210604-data.txt
    >>> print(unique_file_name("https://www.some-server.org/2019/data.txt"))
    9780092867b497fca6fc87d8308f1025-data.txt
    >>> print(unique_file_name("https://www.some-server.org/2020/data.txt.gz"))
    181a9d52e908219c2076f55145d6a344-data.txt.gz

    """
    # MD5 is used only as a fingerprint of the URL, not for security.
    md5 = hashlib.md5(url.encode(), usedforsecurity=False).hexdigest()
    # Last component of the URL path is the original file name.
    fname = parse_url(url)["path"].split("/")[-1]
    # Crop the start of the file name so the full "{md5}-{fname}" string
    # fits within the 255-character limit of most file systems.
    fname = fname[-(255 - len(md5) - 1) :]
    return f"{md5}-{fname}"
|
||||
Reference in New Issue
Block a user