2026-04-10 15:06:59 +02:00
parent 3031b7153b
commit e5a4711004
7806 changed files with 1918528 additions and 335 deletions


@@ -0,0 +1,69 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
# pylint: disable=missing-docstring,import-outside-toplevel,import-self
#
# Import functions/classes to make up the public API
from .core import Pooch, create, retrieve
from .utils import os_cache, check_version, get_logger
from .hashes import file_hash, make_registry
from .downloaders import (
HTTPDownloader,
FTPDownloader,
SFTPDownloader,
DOIDownloader,
)
from .processors import Unzip, Untar, Decompress
# This file is generated automatically by setuptools_scm
from . import _version # type: ignore
# Add a "v" to the version number
__version__ = f"v{_version.version}"
def test(doctest=True, verbose=True, coverage=False):
"""
Run the test suite.
Uses `pytest <https://pytest.org/>`__ to discover and run the tests.
Parameters
----------
doctest : bool
If ``True``, will run the doctests as well (code examples that start
with a ``>>>`` in the docs).
verbose : bool
If ``True``, will print extra information during the test run.
coverage : bool
If ``True``, will run test coverage analysis on the code as well.
Requires ``pytest-cov``.
Raises
------
AssertionError
If pytest returns a non-zero error code indicating that some tests have
failed.
"""
import pytest
package = __name__
args = []
if verbose:
args.append("-vv")
if coverage:
args.append(f"--cov={package}")
args.append("--cov-report=term-missing")
if doctest:
args.append("--doctest-modules")
args.append("--pyargs")
args.append(package)
status = pytest.main(args)
assert status == 0, "Some tests have failed."
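# Illustrative sketch (not part of the original Pooch source): invoking the
# test runner defined above. Wrapped in a private, hypothetical function so
# that importing this module never triggers a test run. Requires pytest (and
# pytest-cov when coverage=True) to be installed.
def _example_run_tests():
    # Runs pytest on the installed package; raises AssertionError on failure.
    test(doctest=False, verbose=True, coverage=True)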


@@ -0,0 +1,34 @@
# file generated by setuptools-scm
# don't change, don't track in version control
__all__ = [
"__version__",
"__version_tuple__",
"version",
"version_tuple",
"__commit_id__",
"commit_id",
]
TYPE_CHECKING = False
if TYPE_CHECKING:
from typing import Tuple
from typing import Union
VERSION_TUPLE = Tuple[Union[int, str], ...]
COMMIT_ID = Union[str, None]
else:
VERSION_TUPLE = object
COMMIT_ID = object
version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID
__version__ = version = '1.9.0'
__version_tuple__ = version_tuple = (1, 9, 0)
__commit_id__ = commit_id = None


@@ -0,0 +1,838 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
The main Pooch class and a factory function for it.
"""
import os
import time
import contextlib
from pathlib import Path
import shlex
import shutil
from typing import Union, Optional, Any
from .hashes import hash_matches, file_hash
from .utils import (
check_version,
get_logger,
make_local_storage,
cache_location,
temporary_file,
os_cache,
unique_file_name,
)
from .downloaders import DOIDownloader, choose_downloader, doi_to_repository
from .typing import PathType, PathInputType, Processor, Downloader, Action
def retrieve(
url: str,
known_hash: Optional[str] = None,
fname: Optional[str] = None,
path: Optional[PathType] = None,
processor: Optional[Processor] = None,
downloader: Optional[Downloader] = None,
progressbar: bool = False,
) -> str:
"""
Download and cache a single file locally.
Uses HTTP or FTP by default, depending on the protocol in the given *url*.
Other download methods can be controlled through the *downloader* argument
(see below).
The file will be downloaded to a temporary location first and its hash will
be compared to the given *known_hash*. This is done to ensure that the
download happened correctly and securely. If the hash doesn't match, the
file will be deleted and an exception will be raised.
If the file already exists locally, its hash will be compared to
*known_hash*. If they are not the same, this is interpreted as the file
needing to be updated and it will be downloaded again.
You can bypass these checks by passing ``known_hash=None``. If this is
done, the SHA256 hash of the downloaded file will be logged to the screen.
It is highly recommended that you copy and paste this hash as *known_hash*
so that future downloads are guaranteed to be the exact same file. This is
crucial for reproducible computations.
If the file exists in the given *path* with the given *fname* and the hash
matches, it will not be downloaded and the absolute path to the file will
be returned.
.. note::
This function is meant for downloading single files. If you need to
manage the download and caching of several files, with versioning, use
:func:`pooch.create` and :class:`pooch.Pooch` instead.
Parameters
----------
url : str
The URL to the file that is to be downloaded. Ideally, the URL should
end in a file name.
known_hash : str or None
A known hash (checksum) of the file. Will be used to verify the
download or check if an existing file needs to be updated. By default,
will assume it's a SHA256 hash. To specify a different hashing method,
prepend the hash with ``algorithm:``, for example
``md5:pw9co2iun29juoh`` or ``sha1:092odwhi2ujdp2du2od2odh2wod2``. If
None, will NOT check the hash of the downloaded file or check if an
existing file needs to be updated.
fname : str or None
The name that will be used to save the file. Should NOT include the
full path, just the file name (it will be appended to *path*). If
None, will create a unique file name using a combination of the last
part of the URL (assuming it's the file name) and the MD5 hash of the
URL. For example, ``81whdo2d2e928yd1wi22-data-file.csv``. This ensures
that files from different URLs never overwrite each other, even if they
have the same name.
path : str or PathLike or None
The location of the cache folder on disk. This is where the file will
be saved. If None, will save to a ``pooch`` folder in the default cache
location for your operating system (see :func:`pooch.os_cache`).
processor : None or callable
If not None, then a function (or callable object) that will be called
before returning the full path and after the file has been downloaded
(if required). See :ref:`processors` for details.
downloader : None or callable
If not None, then a function (or callable object) that will be called
to download a given URL to a provided local file name. See
:ref:`downloaders` for details.
progressbar : bool or an arbitrary progress bar object
If True, will print a progress bar of the download to standard error
(stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to be
installed. Alternatively, an arbitrary progress bar object can be
passed. See :ref:`custom-progressbar` for details.
Returns
-------
full_path : str
The absolute path (including the file name) of the file in the local
storage.
Examples
--------
Download one of the data files from the Pooch repository on GitHub:
>>> import os
>>> from pooch import __version__, check_version, retrieve
>>> # Make a URL for the version of pooch we have installed
>>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
>>> url = url.format(check_version(__version__, fallback="main"))
>>> # Download the file and save it locally. Will check the MD5 checksum of
>>> # the downloaded file against the given value to make sure it's the
>>> # right file. You can use other hashes by specifying different
>>> # algorithm names (sha256, sha1, etc).
>>> fname = retrieve(
... url, known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e",
... )
>>> with open(fname) as f:
... print(f.read().strip())
# A tiny data file for test purposes only
1 2 3 4 5 6
>>> # Running again won't trigger a download and only return the path to
>>> # the existing file.
>>> fname2 = retrieve(
... url, known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e",
... )
>>> print(fname2 == fname)
True
>>> os.remove(fname)
Files that are compressed with gzip, xz/lzma, or bzip2 can be automatically
decompressed by using the :class:`pooch.Decompress` processor:
>>> from pooch import Decompress
>>> # URL to a gzip-compressed version of the data file.
>>> url = ("https://github.com/fatiando/pooch/raw/{}/"
... + "pooch/tests/data/tiny-data.txt.gz")
>>> url = url.format(check_version(__version__, fallback="main"))
>>> # By default, you would have to decompress the file yourself
>>> fname = retrieve(
... url,
... known_hash="md5:8812ba10b6c7778014fdae81b03f9def",
... )
>>> print(os.path.splitext(fname)[1])
.gz
>>> # Use the processor to automatically decompress after download and
>>> # return the path to the decompressed file instead.
>>> fname2 = retrieve(
... url,
... known_hash="md5:8812ba10b6c7778014fdae81b03f9def",
... processor=Decompress(),
... )
>>> print(fname2 == fname)
False
>>> with open(fname2) as f:
... print(f.read().strip())
# A tiny data file for test purposes only
1 2 3 4 5 6
>>> os.remove(fname)
>>> os.remove(fname2)
When downloading archives (zip or tar), it can be useful to unpack them
after download to avoid having to do that yourself. Use the processors
:class:`pooch.Unzip` or :class:`pooch.Untar` to do this automatically:
>>> from pooch import Unzip
>>> # URL to a zip archive with a single data file.
>>> url = ("https://github.com/fatiando/pooch/raw/{}/"
... + "pooch/tests/data/tiny-data.zip")
>>> url = url.format(check_version(__version__, fallback="main"))
>>> # By default, you would get the path to the archive
>>> fname = retrieve(
... url,
... known_hash="md5:e9592cb46cf3514a1079051f8a148148",
... )
>>> print(os.path.splitext(fname)[1])
.zip
>>> os.remove(fname)
>>> # Using the processor, the archive will be unzipped and a list with the
>>> # path to every file will be returned instead of a single path.
>>> fnames = retrieve(
... url,
... known_hash="md5:e9592cb46cf3514a1079051f8a148148",
... processor=Unzip(),
... )
>>> # There was only a single file in our archive.
>>> print(len(fnames))
1
>>> with open(fnames[0]) as f:
... print(f.read().strip())
# A tiny data file for test purposes only
1 2 3 4 5 6
>>> for f in fnames:
... os.remove(f)
"""
if path is None:
path = os_cache("pooch")
if fname is None:
fname = unique_file_name(url)
# Make the path absolute.
path = cache_location(path, env=None, version=None)
full_path = path.resolve() / fname
action, verb = download_action(full_path, known_hash)
if action in ("download", "update"):
# We need to write data, so create the local data directory if it
# doesn't already exist.
make_local_storage(path)
get_logger().info(
"%s data from '%s' to file '%s'.",
verb,
url,
str(full_path),
)
if downloader is None:
downloader = choose_downloader(url, progressbar=progressbar)
stream_download(url, full_path, known_hash, downloader, pooch=None)
if known_hash is None:
get_logger().info(
"SHA256 hash of downloaded file: %s\n"
"Use this value as the 'known_hash' argument of 'pooch.retrieve'"
" to ensure that the file hasn't changed if it is downloaded again"
" in the future.",
file_hash(str(full_path)),
)
if processor is not None:
return processor(str(full_path), action, None)
return str(full_path)
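# Illustrative sketch (not part of the original Pooch source): combining
# `retrieve` with a custom downloader for a password-protected server. The
# function name, URL, and credentials are placeholders, and this assumes (per
# the downloader documentation) that HTTPDownloader forwards extra keyword
# arguments to the underlying requests call.
def _example_retrieve_with_auth():
    from .downloaders import HTTPDownloader

    downloader = HTTPDownloader(auth=("username", "password"))  # HTTP basic auth
    # With known_hash=None the SHA256 of the download is logged so it can be
    # copied into the call and pinned for future runs.
    return retrieve(
        "https://example.com/protected/data.csv",
        known_hash=None,
        downloader=downloader,
    )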
def create(
path: PathInputType,
base_url: str,
version: Optional[str] = None,
version_dev: str = "master",
env: Optional[str] = None,
registry: Optional[dict] = None,
urls: Optional[dict] = None,
retry_if_failed: int = 0,
allow_updates: Union[bool, str] = True,
):
"""
Create a :class:`~pooch.Pooch` with sensible defaults to fetch data files.
If a version string is given, the Pooch will be versioned, meaning that the
local storage folder and the base URL depend on the project version. This
is necessary if your users have multiple versions of your library installed
(using virtual environments) and you updated the data files between
versions. Otherwise, every switch between environments would trigger a
re-download of the data. The version string will be appended to the local
storage path (for example, ``~/.mypooch/cache/v0.1``) and inserted into the
base URL (for example,
``https://github.com/fatiando/pooch/raw/v0.1/data``). If the version string
contains ``+XX.XXXXX``, it will be interpreted as a development version.
Does **not** create the local data storage folder. The folder will only be
created the first time a download is attempted with
:meth:`pooch.Pooch.fetch`. This makes it safe to use this function at the
module level (so it's executed on ``import`` and the resulting
:class:`~pooch.Pooch` is a global variable).
Parameters
----------
path : str, PathLike, list or tuple
The path to the local data storage folder. If this is a list or tuple,
we'll join the parts with the appropriate separator. The *version* will
be appended to the end of this path. Use :func:`pooch.os_cache` for a
sensible default.
base_url : str
Base URL for the remote data source. All requests will be made relative
to this URL. The string should have a ``{version}`` formatting mark in
it. We will call ``.format(version=version)`` on this string. If the
URL does not end in a ``'/'``, a trailing ``'/'`` will be added
automatically.
version : str or None
The version string for your project. Should be PEP440 compatible. If
None is given, will not attempt to format *base_url* and no subfolder
will be appended to *path*.
version_dev : str
The name used for the development version of a project. If your data is
hosted on GitHub (and *base_url* is a GitHub raw link), then
``"master"`` is a good choice (default). Ignored if *version* is None.
env : str or None
An environment variable that can be used to overwrite *path*. This
allows users to control where they want the data to be stored. We'll
append *version* to the end of this value as well.
registry : dict or None
A record of the files that are managed by this Pooch. Keys should be
the file names and the values should be their hashes. Only files
in the registry can be fetched from the local storage. Files in
subdirectories of *path* **must use Unix-style separators** (``'/'``)
even on Windows.
urls : dict or None
Custom URLs for downloading individual files in the registry. A
dictionary with the file names as keys and the custom URLs as values.
Not all files in *registry* need an entry in *urls*. If a file has an
entry in *urls*, the *base_url* will be ignored when downloading it in
favor of ``urls[fname]``.
retry_if_failed : int
Retry a file download the specified number of times if it fails because
of a bad connection or a hash mismatch. By default, downloads are only
attempted once (``retry_if_failed=0``). Initially, will wait for 1s
between retries and then increase the wait time by 1s with each retry
until a maximum of 10s.
allow_updates : bool or str
Whether existing files in local storage that have a hash mismatch with
the registry are allowed to update from the remote URL. If a string is
passed, we will assume it's the name of an environment variable that
will be checked for the true/false value. If ``False``, any mismatch
with hashes in the registry will result in an error. Defaults to
``True``.
Returns
-------
pooch : :class:`~pooch.Pooch`
The :class:`~pooch.Pooch` initialized with the given arguments.
Examples
--------
Create a :class:`~pooch.Pooch` for a release (v0.1):
>>> pup = create(path="myproject",
... base_url="http://some.link.com/{version}/",
... version="v0.1",
... registry={"data.txt": "9081wo2eb2gc0u..."})
>>> print(pup.path.parts) # The path is a pathlib.Path
('myproject', 'v0.1')
>>> # The local folder is only created when a dataset is first downloaded
>>> print(pup.path.exists())
False
>>> print(pup.base_url)
http://some.link.com/v0.1/
>>> print(pup.registry)
{'data.txt': '9081wo2eb2gc0u...'}
>>> print(pup.registry_files)
['data.txt']
If this is a development version (12 commits ahead of v0.1), then the
``version_dev`` will be used (defaults to ``"master"``):
>>> pup = create(path="myproject",
... base_url="http://some.link.com/{version}/",
... version="v0.1+12.do9iwd")
>>> print(pup.path.parts)
('myproject', 'master')
>>> print(pup.base_url)
http://some.link.com/master/
Versioning is optional (but highly encouraged):
>>> pup = create(path="myproject",
... base_url="http://some.link.com/",
... registry={"data.txt": "9081wo2eb2gc0u..."})
>>> print(pup.path.parts) # The path is a pathlib.Path
('myproject',)
>>> print(pup.base_url)
http://some.link.com/
To place the storage folder at a subdirectory, pass in a list and we'll
join the path for you using the appropriate separator for your operating
system:
>>> pup = create(path=["myproject", "cache", "data"],
... base_url="http://some.link.com/{version}/",
... version="v0.1")
>>> print(pup.path.parts)
('myproject', 'cache', 'data', 'v0.1')
The user can overwrite the storage path by setting an environment variable:
>>> # The variable is not set so we'll use *path*
>>> pup = create(path=["myproject", "not_from_env"],
... base_url="http://some.link.com/{version}/",
... version="v0.1",
... env="MYPROJECT_DATA_DIR")
>>> print(pup.path.parts)
('myproject', 'not_from_env', 'v0.1')
>>> # Set the environment variable and try again
>>> import os
>>> os.environ["MYPROJECT_DATA_DIR"] = os.path.join("myproject", "env")
>>> pup = create(path=["myproject", "not_env"],
... base_url="http://some.link.com/{version}/",
... version="v0.1",
... env="MYPROJECT_DATA_DIR")
>>> print(pup.path.parts)
('myproject', 'env', 'v0.1')
"""
if version is not None:
version = check_version(version, fallback=version_dev)
base_url = base_url.format(version=version)
# Don't create the cache folder here! This function is usually called in
# the module context (at import time), so touching the file system is not
# recommended. It could cause crashes when multiple processes/threads try
# to import at the same time (which would try to create the folder several
# times at once).
path = cache_location(path, env, version)
if isinstance(allow_updates, str):
allow_updates = os.environ.get(allow_updates, "true").lower() != "false"
# add trailing "/"
base_url = base_url.rstrip("/") + "/"
pup = Pooch(
path=path,
base_url=base_url,
registry=registry,
urls=urls,
retry_if_failed=retry_if_failed,
allow_updates=allow_updates,
)
return pup
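# Illustrative sketch (not part of the original Pooch source): the
# module-level pattern that the `create` docstring describes. In a real
# package the `create` call would sit at import time (safe, since no folders
# are created) and downloads would happen only inside the fetch function. All
# names, URLs, and the hash below are placeholders; the sketch is wrapped in a
# function so nothing runs on import.
def _example_module_level_pattern():
    pup = create(
        path=os_cache("myproject"),
        base_url="https://github.com/myorg/myproject/raw/{version}/data/",
        version="v1.0.0",
        registry={"gravity.csv": "placeholder-sha256-hash"},  # use a real hash
    )

    def fetch_gravity_data():
        "Download (if needed) and return the path to the sample data file."
        return pup.fetch("gravity.csv")

    return fetch_gravity_data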
class Pooch:
"""
Manager for a local data storage that can fetch from a remote source.
Avoid creating ``Pooch`` instances directly. Use :func:`pooch.create`
instead.
Parameters
----------
path : str
The path to the local data storage folder. The path must exist in the
file system.
base_url : str
Base URL for the remote data source. All requests will be made relative
to this URL.
registry : dict or None
A record of the files that are managed by this good boy. Keys should be
the file names and the values should be their hashes. Only files
in the registry can be fetched from the local storage. Files in
subdirectories of *path* **must use Unix-style separators** (``'/'``)
even on Windows.
urls : dict or None
Custom URLs for downloading individual files in the registry. A
dictionary with the file names as keys and the custom URLs as values.
Not all files in *registry* need an entry in *urls*. If a file has an
entry in *urls*, the *base_url* will be ignored when downloading it in
favor of ``urls[fname]``.
retry_if_failed : int
Retry a file download the specified number of times if it fails because
of a bad connection or a hash mismatch. By default, downloads are only
attempted once (``retry_if_failed=0``). Initially, will wait for 1s
between retries and then increase the wait time by 1s with each retry
until a maximum of 10s.
allow_updates : bool
Whether existing files in local storage that have a hash mismatch with
the registry are allowed to update from the remote URL. If ``False``,
any mismatch with hashes in the registry will result in an error.
Defaults to ``True``.
"""
def __init__(
self,
path: PathType,
base_url: str,
registry: Optional[dict[str, str]] = None,
urls: Optional[dict[str, str]] = None,
retry_if_failed: int = 0,
allow_updates: bool = True,
) -> None:
self.path = path
self.base_url = base_url
if registry is None:
registry = {}
self.registry = registry
if urls is None:
urls = {}
self.urls = dict(urls)
self.retry_if_failed = retry_if_failed
self.allow_updates = allow_updates
@property
def abspath(self) -> Path:
"Absolute path to the local storage"
return Path(os.path.abspath(os.path.expanduser(str(self.path))))
@property
def registry_files(self) -> list[str]:
"List of file names on the registry"
return list(self.registry)
def fetch(
self,
fname: str,
processor: Optional[Processor] = None,
downloader: Optional[Downloader] = None,
progressbar: bool = False,
) -> str:
"""
Get the absolute path to a file in the local storage.
If it's not in the local storage, it will be downloaded. If the hash of
the file in local storage doesn't match the one in the registry, will
download a new copy of the file. This is considered a sign that the
file was updated in the remote storage. If the hash of the downloaded
file still doesn't match the one in the registry, will raise an
exception to warn of possible file corruption.
Post-processing actions sometimes need to be taken on downloaded files
(unzipping, conversion to a more efficient format, etc). If these
actions are time or memory consuming, it would be best to do this only
once right after the file is downloaded. Use the *processor* argument
to specify a function that is executed after the download to perform
these actions. See :ref:`processors` for details.
Custom file downloaders can be provided through the *downloader*
argument. By default, Pooch will determine the download protocol from
the URL in the registry. If the server for a given file requires
authentication (username and password), use a downloader that supports
these features. Downloaders can also be used to print custom messages
(like a progress bar), etc. See :ref:`downloaders` for details.
Parameters
----------
fname : str
The file name (relative to the *base_url* of the remote data
storage) to fetch from the local storage.
processor : None or callable
If not None, then a function (or callable object) that will be
called before returning the full path and after the file has been
downloaded. See :ref:`processors` for details.
downloader : None or callable
If not None, then a function (or callable object) that will be
called to download a given URL to a provided local file name. See
:ref:`downloaders` for details.
progressbar : bool or an arbitrary progress bar object
If True, will print a progress bar of the download to standard
error (stderr). Requires `tqdm <https://github.com/tqdm/tqdm>`__ to
be installed. Alternatively, an arbitrary progress bar object can
be passed. See :ref:`custom-progressbar` for details.
Returns
-------
full_path : str
The absolute path (including the file name) of the file in the
local storage.
"""
self._assert_file_in_registry(fname)
url = self.get_url(fname)
full_path = self.abspath / fname
known_hash = self.registry[fname]
action, verb = download_action(full_path, known_hash)
if action == "update" and not self.allow_updates:
raise ValueError(
f"{fname} needs to update {full_path} but updates are disallowed."
)
if action in ("download", "update"):
# We need to write data, so create the local data directory if it
# doesn't already exist.
make_local_storage(str(self.abspath))
get_logger().info(
"%s file '%s' from '%s' to '%s'.",
verb,
fname,
url,
str(self.abspath),
)
if downloader is None:
downloader = choose_downloader(url, progressbar=progressbar)
stream_download(
url,
full_path,
known_hash,
downloader,
pooch=self,
retry_if_failed=self.retry_if_failed,
)
if processor is not None:
return processor(str(full_path), action, self)
return str(full_path)
def _assert_file_in_registry(self, fname: str) -> None:
"""
Check if a file is in the registry and raise :class:`ValueError` if
it's not.
"""
if fname not in self.registry:
raise ValueError(f"File '{fname}' is not in the registry.")
def get_url(self, fname: str) -> str:
"""
Get the full URL to download a file in the registry.
Parameters
----------
fname : str
The file name (relative to the *base_url* of the remote data
storage) to fetch from the local storage.
"""
self._assert_file_in_registry(fname)
return self.urls.get(fname, "".join([self.base_url, fname]))
def load_registry(self, fname: PathType) -> None:
"""
Load entries from a file and add them to the registry.
Use this if you are managing many files.
Each line of the file should have a file name and its hash separated by
a space. The hash can specify the checksum algorithm using the
``alg:hash`` format. If no algorithm is provided, SHA256 is used by
default. Only one file per line is allowed. Custom download URLs for
individual files can be specified as a third element on the line. Line
comments can be added and must start with ``#``.
Parameters
----------
fname : str | fileobj
Path (or open file object) to the registry file.
"""
with contextlib.ExitStack() as stack:
if hasattr(fname, "read"):
# It's a file object
fin: Any = fname
else:
# It's a file path
fin = stack.enter_context(open(fname, encoding="utf-8"))
for linenum, line in enumerate(fin):
if isinstance(line, bytes):
line = line.decode("utf-8")
line = line.strip()
# skip line comments
if line.startswith("#"):
continue
elements = shlex.split(line)
if len(elements) not in (0, 2, 3):
raise OSError(
f"Invalid entry in Pooch registry file '{fname}': "
f"expected 2 or 3 elements in line {linenum + 1} but got "
f"{len(elements)}. Offending entry: '{line}'"
)
if elements:
file_name = elements[0]
file_checksum = elements[1]
if len(elements) == 3:
file_url = elements[2]
self.urls[file_name] = file_url
self.registry[file_name] = file_checksum.lower()
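# Illustrative example (not part of the original Pooch source): a registry
# file in the format parsed by `load_registry` above. The names, hashes, and
# URL are placeholders:
#
#     # Comment lines start with "#"
#     data/samples.csv 19uheidhlkjdwhoiwuhc0uhcwljchw9ochwochw89dcgw9dcg2wd
#     archive.zip md5:0hw928109jlfwd0921jd0dhw https://example.com/archive.zip
#
# Each non-empty line holds a file name and its hash (optionally prefixed
# with "alg:"), plus an optional third element giving a custom download URL.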
def load_registry_from_doi(self) -> None:
"""
Populate the registry using the data repository API.
Fill the registry with all the files available in the data repository,
along with their hashes. It will make a request to the data repository
API to retrieve this information. No file is downloaded during this
process.
.. important::
This method is intended to be used only when the ``base_url`` is
a DOI.
"""
# Ensure that this is indeed a DOI-based pooch
downloader = choose_downloader(self.base_url)
if not isinstance(downloader, DOIDownloader):
raise ValueError(
f"Invalid base_url '{self.base_url}': "
+ "Pooch.load_registry_from_doi is only implemented for DOIs"
)
# Create a repository instance
doi = self.base_url.replace("doi:", "")
repository = doi_to_repository(
doi,
headers=downloader.headers,
timeout=downloader.timeout,
**downloader.kwargs,
)
# Call registry population for this repository
return repository.populate_registry(self)
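# Illustrative sketch (not part of the original Pooch source): filling a
# registry from a DOI, as described above. The DOI is a placeholder and must
# point to a repository supported by DOIDownloader (e.g. Zenodo, figshare, or
# Dataverse):
#
#     pup = create(
#         path=os_cache("myproject"),
#         base_url="doi:10.5281/zenodo.0000000/",  # placeholder; keep the "/"
#         registry=None,
#     )
#     pup.load_registry_from_doi()  # queries the API; downloads nothing
#     print(pup.registry_files)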
def is_available(self, fname: str, downloader: Optional[Downloader] = None):
"""
Check availability of a remote file without downloading it.
Use this method when working with large files to check if they are
available for download.
Parameters
----------
fname : str
The file name (relative to the *base_url* of the remote data
storage).
downloader : None or callable
If not None, then a function (or callable object) that will be
called to check the availability of the file on the server. See
:ref:`downloaders` for details.
Returns
-------
status : bool
True if the file is available for download. False otherwise.
"""
self._assert_file_in_registry(fname)
url = self.get_url(fname)
if downloader is None:
downloader = choose_downloader(url)
try:
available = downloader(url, None, self, check_only=True)
except TypeError as error:
error_msg = (
f"Downloader '{str(downloader)}' does not support availability checks."
)
raise NotImplementedError(error_msg) from error
return available
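# Illustrative sketch (not part of the original Pooch source): guarding the
# fetch of a large file with `is_available`, assuming `pup` is a configured
# Pooch and the file name is a placeholder:
#
#     if pup.is_available("huge-model-output.nc"):  # checks without download
#         fname = pup.fetch("huge-model-output.nc")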
def download_action(path: Path, known_hash: Optional[str]) -> tuple[Action, str]:
"""
Determine the action that is needed to get the file on disk.
Parameters
----------
path : PathLike
The path to the file on disk.
known_hash : str
A known hash (checksum) of the file. Will be used to verify the
download or check if an existing file needs to be updated. By default,
will assume it's a SHA256 hash. To specify a different hashing method,
prepend the hash with ``algorithm:``, for example
``md5:pw9co2iun29juoh`` or ``sha1:092odwhi2ujdp2du2od2odh2wod2``.
Returns
-------
action, verb : str
The action that must be taken and the English verb (present participle
of *action*) used in the log:
* ``'download'``: File does not exist locally and must be downloaded.
* ``'update'``: File exists locally but needs to be updated.
* ``'fetch'``: File exists locally and only its path needs to be returned.
"""
if not path.exists():
return "download", "Downloading"
if not hash_matches(str(path), known_hash):
return "update", "Updating"
return "fetch", "Fetching"
def stream_download(
url: str,
fname: Path,
known_hash: Optional[str],
downloader: Downloader,
pooch: Optional[Pooch] = None,
retry_if_failed: int = 0,
) -> None:
"""
Stream the file and check that its hash matches the known one.
The file is first downloaded to a temporary file name in the cache folder.
It will be moved to the desired file name only if the hash matches the
known hash. Otherwise, the temporary file is deleted.
If the download fails because of a bad connection or a hash mismatch, we
will retry it the specified number of times in case the failure was due
to a transient network error.
"""
# Lazy import requests to speed up import time
import requests.exceptions # pylint: disable=C0415
# Ensure the parent directory exists in case the file is in a subdirectory.
# Otherwise, move will cause an error.
if not fname.parent.exists():
os.makedirs(str(fname.parent))
download_attempts = 1 + retry_if_failed
max_wait = 10
for i in range(download_attempts):
try:
# Stream the file to a temporary so that we can safely check its
# hash before overwriting the original.
with temporary_file(path=str(fname.parent)) as tmp:
downloader(url, tmp, pooch)
hash_matches(tmp, known_hash, strict=True, source=str(fname.name))
shutil.move(tmp, str(fname))
break
except (ValueError, requests.exceptions.RequestException):
if i == download_attempts - 1:
raise
retries_left = download_attempts - (i + 1)
get_logger().info(
"Failed to download '%s'. "
"Will attempt the download again %d more time%s.",
str(fname.name),
retries_left,
"s" if retries_left > 1 else "",
)
time.sleep(min(i + 1, max_wait))

File diff suppressed because it is too large


@@ -0,0 +1,228 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Calculating and checking file hashes.
"""
import hashlib
import functools
from pathlib import Path
# From the docs: https://docs.python.org/3/library/hashlib.html#hashlib.new
# The named constructors are much faster than new() and should be
# preferred.
# Need to fall back on new() for some algorithms.
ALGORITHMS_AVAILABLE = {
alg: getattr(hashlib, alg, functools.partial(hashlib.new, alg))
for alg in hashlib.algorithms_available
}
try:
import xxhash
# xxhash doesn't have a list of available algorithms yet.
# https://github.com/ifduyue/python-xxhash/issues/48
ALGORITHMS_AVAILABLE.update(
{
alg: getattr(xxhash, alg, None)
for alg in ["xxh128", "xxh64", "xxh32", "xxh3_128", "xxh3_64"]
}
)
# The xxh3 algorithms are only available for xxhash>=2.0. Unavailable
# algorithms are mapped to None above and removed here to ensure backwards
# compatibility.
ALGORITHMS_AVAILABLE = {
alg: func for alg, func in ALGORITHMS_AVAILABLE.items() if func is not None
}
except ImportError:
pass
def file_hash(fname, alg="sha256"):
"""
Calculate the hash of a given file.
Useful for checking if a file has changed or been corrupted.
Parameters
----------
fname : str
The name of the file.
alg : str
The name of the hashing algorithm.
Returns
-------
hash : str
The hash of the file.
Examples
--------
>>> fname = "test-file-for-hash.txt"
>>> with open(fname, "w") as f:
... __ = f.write("content of the file")
>>> print(file_hash(fname))
0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
>>> import os
>>> os.remove(fname)
"""
if alg not in ALGORITHMS_AVAILABLE:
raise ValueError(
f"Algorithm '{alg}' not available to the pooch library. "
"Only the following algorithms are available "
f"{list(ALGORITHMS_AVAILABLE.keys())}."
)
# Calculate the hash in chunks to avoid overloading the memory
chunksize = 65536
# For hashlib algorithms, use usedforsecurity=False to support FIPS-enabled
# systems. xxhash algorithms don't support this parameter.
hasher = (
ALGORITHMS_AVAILABLE[alg](usedforsecurity=False)
if alg in hashlib.algorithms_available
else ALGORITHMS_AVAILABLE[alg]()
)
with open(fname, "rb") as fin:
buff = fin.read(chunksize)
while buff:
hasher.update(buff)
buff = fin.read(chunksize)
return hasher.hexdigest()
def hash_algorithm(hash_string):
"""
Parse the name of the hash method from the hash string.
The hash string should have the following form ``algorithm:hash``, where
algorithm can be the name of any algorithm known to :mod:`hashlib`.
If the algorithm is omitted or the hash string is None, will default to
``"sha256"``.
Parameters
----------
hash_string : str
The hash string with optional algorithm prepended.
Returns
-------
hash_algorithm : str
The name of the algorithm.
Examples
--------
>>> print(hash_algorithm("qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
sha256
>>> print(hash_algorithm("md5:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
md5
>>> print(hash_algorithm("sha256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
sha256
>>> print(hash_algorithm("SHA256:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
sha256
>>> print(hash_algorithm("xxh3_64:qouuwhwd2j192y1lb1iwgowdj2898wd2d9"))
xxh3_64
>>> print(hash_algorithm(None))
sha256
"""
default = "sha256"
if hash_string is None:
algorithm = default
elif ":" not in hash_string:
algorithm = default
else:
algorithm = hash_string.split(":")[0]
return algorithm.lower()
def hash_matches(fname, known_hash, strict=False, source=None):
"""
Check if the hash of a file matches a known hash.
If the *known_hash* is None, will always return True.
Converts hashes to lowercase before comparison to avoid system-specific
mismatches between hashes in the registry and computed hashes.
Parameters
----------
fname : str or PathLike
The path to the file.
known_hash : str
The known hash. Optionally, prepend ``alg:`` to the hash to specify the
hashing algorithm. Default is SHA256.
strict : bool
If True, will raise a :class:`ValueError` if the hash does not match,
informing the user that the file may be corrupted.
source : str
The source of the downloaded file (name or URL, for example). Will be
used in the error message if *strict* is True. Has no use other
than reporting to the user where the file came from in case of hash
mismatch. If None, will default to *fname*.
Returns
-------
is_same : bool
True if the hash matches, False otherwise.
"""
if known_hash is None:
return True
algorithm = hash_algorithm(known_hash)
new_hash = file_hash(fname, alg=algorithm)
matches = new_hash.lower() == known_hash.split(":")[-1].lower()
if strict and not matches:
if source is None:
source = str(fname)
raise ValueError(
f"{algorithm.upper()} hash of downloaded file ({source}) does not match"
f" the known hash: expected {known_hash} but got {new_hash}. Deleted"
" download for safety. The downloaded file may have been corrupted or"
" the known hash may be outdated."
)
return matches
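# Illustrative sketch (not part of the original Pooch source): hash_matches
# in action on a throwaway file, showing the default algorithm, the
# "alg:hash" prefix, and case-insensitive comparison. The function name is
# hypothetical; nothing here runs on import.
def _example_hash_matches():
    import os
    import tempfile

    with tempfile.NamedTemporaryFile("w", delete=False, suffix=".txt") as tmp:
        tmp.write("content of the file")
        fname = tmp.name
    try:
        known = file_hash(fname)  # defaults to SHA256
        assert hash_matches(fname, known)
        assert hash_matches(fname, f"sha256:{known.upper()}")  # case ignored
        assert not hash_matches(fname, "sha256:" + "0" * 64)
    finally:
        os.remove(fname)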
def make_registry(directory, output, recursive=True):
"""
Make a registry of files and hashes for the given directory.
This is helpful if you have many files in your test dataset as it keeps you
from needing to manually update the registry.
Parameters
----------
directory : str
Directory of the test data to put in the registry. All file names in
the registry will be relative to this directory.
output : str
Name of the output registry file.
recursive : bool
If True, will recursively look for files in subdirectories of
*directory*.
"""
directory = Path(directory)
if recursive:
pattern = "**/*"
else:
pattern = "*"
files = sorted(
str(path.relative_to(directory))
for path in directory.glob(pattern)
if path.is_file()
)
hashes = [file_hash(str(directory / fname)) for fname in files]
with open(output, "w", encoding="utf-8") as outfile:
for fname, fhash in zip(files, hashes):
# Only use Unix separators for the registry so that we don't go
# insane dealing with file paths.
outfile.write("{} {}\n".format(fname.replace("\\", "/"), fhash))
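# Illustrative sketch (not part of the original Pooch source): building a
# registry file for a data directory with `make_registry` above. The paths
# are placeholders; the output file can later be passed to
# Pooch.load_registry.
def _example_make_registry():
    # Writes one "name hash" pair per line, recursing into subdirectories and
    # always using "/" as the path separator.
    make_registry("myproject/tests/data", "myproject/registry.txt")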


@@ -0,0 +1,415 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
# pylint: disable=line-too-long
"""
Post-processing hooks
"""
import abc
import os
import bz2
import gzip
import lzma
import shutil
import sys
from zipfile import ZipFile
from tarfile import TarFile
from .utils import get_logger
class ExtractorProcessor(abc.ABC): # pylint: disable=too-few-public-methods
"""
Abstract base class for extractions from compressed archives.
Subclasses can be used with :meth:`pooch.Pooch.fetch` and
:func:`pooch.retrieve` to unzip a downloaded data file into a folder in the
local data store. :meth:`~pooch.Pooch.fetch` will return a list with the
names of the extracted files instead of the archive.
Parameters
----------
members : list or None
If None, will unpack all files in the archive. Otherwise, *members*
must be a list of file names to unpack from the archive. Only these
files will be unpacked.
extract_dir : str or None
If None, files will be unpacked to the default location (a folder in
the same location as the downloaded zip file, with a suffix added).
Otherwise, files will be unpacked to ``extract_dir``, which is
interpreted as a *relative path* (relative to the cache location
provided by :func:`pooch.retrieve` or :meth:`pooch.Pooch.fetch`).
"""
def __init__(self, members=None, extract_dir=None):
self.members = members
self.extract_dir = extract_dir
@property
@abc.abstractmethod
def suffix(self):
"""
String appended to unpacked archive folder name.
Only used if extract_dir is None.
MUST BE IMPLEMENTED BY CHILD CLASSES.
"""
@abc.abstractmethod
def _all_members(self, fname):
"""
Return all the members in the archive.
MUST BE IMPLEMENTED BY CHILD CLASSES.
"""
@abc.abstractmethod
def _extract_file(self, fname, extract_dir):
"""
This method receives an argument for the archive to extract and the
destination path.
MUST BE IMPLEMENTED BY CHILD CLASSES.
"""
def __call__(self, fname, action, pooch):
"""
Extract all files from the given archive.
Parameters
----------
fname : str
Full path of the zipped file in local storage.
action : str
Indicates what action was taken by :meth:`pooch.Pooch.fetch` or
:func:`pooch.retrieve`:
* ``"download"``: File didn't exist locally and was downloaded
* ``"update"``: Local file was outdated and was re-download
* ``"fetch"``: File exists and is updated so it wasn't downloaded
pooch : :class:`pooch.Pooch`
The instance of :class:`pooch.Pooch` that is calling this.
Returns
-------
fnames : list of str
A list of the full path to all files in the extracted archive.
"""
if self.extract_dir is None:
self.extract_dir = fname + self.suffix
else:
archive_dir = fname.rsplit(os.path.sep, maxsplit=1)[0]
self.extract_dir = os.path.join(archive_dir, self.extract_dir)
# Get a list of everyone who is supposed to be in the unpacked folder
# so we can check if they are all there or if we need to extract new
# files.
if self.members is None or not self.members:
members = self._all_members(fname)
else:
members = self.members
if (
(action in ("update", "download"))
or (not os.path.exists(self.extract_dir))
or not all(
os.path.exists(os.path.join(self.extract_dir, m)) for m in members
)
):
# Make sure that the folder with the extracted files exists
os.makedirs(self.extract_dir, exist_ok=True)
self._extract_file(fname, self.extract_dir)
# Get a list of all file names (including subdirectories) in our folder
# of unzipped files, filtered by the given members list
fnames = []
for path, _, files in os.walk(self.extract_dir):
for filename in files:
relpath = os.path.normpath(
os.path.join(os.path.relpath(path, self.extract_dir), filename)
)
if self.members is None or any(
relpath.startswith(os.path.normpath(m)) for m in self.members
):
fnames.append(os.path.join(path, filename))
return fnames
class Unzip(ExtractorProcessor): # pylint: disable=too-few-public-methods
"""
Processor that unpacks a zip archive and returns a list of all files.
Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to unzip a
downloaded data file into a folder in the local data store. The
method/function will return a list with the names of the unzipped files
instead of the zip archive.
The output folder is ``{fname}.unzip``.
Parameters
----------
members : list or None
If None, will unpack all files in the zip archive. Otherwise, *members*
must be a list of file names to unpack from the archive. Only these
files will be unpacked.
extract_dir : str or None
If None, files will be unpacked to the default location (a folder in
the same location as the downloaded zip file, with the suffix
``.unzip`` added). Otherwise, files will be unpacked to
``extract_dir``, which is interpreted as a *relative path* (relative to
the cache location provided by :func:`pooch.retrieve` or
:meth:`pooch.Pooch.fetch`).
"""
@property
def suffix(self):
"""
String appended to unpacked archive folder name.
Only used if extract_dir is None.
"""
return ".unzip"
def _all_members(self, fname):
"""Return all members from a given archive."""
with ZipFile(fname, "r") as zip_file:
return zip_file.namelist()
def _extract_file(self, fname, extract_dir):
"""
This method receives an argument for the archive to extract and the
destination path.
"""
with ZipFile(fname, "r") as zip_file:
if self.members is None:
get_logger().info(
"Unzipping contents of '%s' to '%s'", fname, extract_dir
)
# Unpack all files from the archive into our new folder
zip_file.extractall(path=extract_dir)
else:
for member in self.members:
get_logger().info(
"Extracting '%s' from '%s' to '%s'", member, fname, extract_dir
)
# If the member is a dir, we need to get the names of the
# elements it contains for extraction (ZipFile does not
# support dirs on .extract). If it's not a dir, this will
# only include the member itself.
# Based on:
# https://stackoverflow.com/questions/8008829/extract-only-a-single-directory-from-tar
subdir_members = [
name
for name in zip_file.namelist()
if os.path.normpath(name).startswith(os.path.normpath(member))
]
# Extract the data file from within the archive
zip_file.extractall(members=subdir_members, path=extract_dir)
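# Illustrative sketch (not part of the original Pooch source): extracting a
# single member from a zip archive with the `members` argument described
# above. `pup`, the archive name, and the member path are placeholders.
def _example_unzip_single_member(pup):
    # Only "data/model.csv" is unpacked; fetch returns a list with the full
    # path of each extracted file instead of the archive path.
    return pup.fetch(
        "big-archive.zip",
        processor=Unzip(members=["data/model.csv"]),
    )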
class Untar(ExtractorProcessor): # pylint: disable=too-few-public-methods
"""
Processor that unpacks a tar archive and returns a list of all files.
Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to untar a
downloaded data file into a folder in the local data store. The
method/function will return a list with the names of the extracted files
instead of the archive.
The output folder is ``{fname}.untar``.
Parameters
----------
members : list or None
If None, will unpack all files in the archive. Otherwise, *members*
must be a list of file names to unpack from the archive. Only these
files will be unpacked.
extract_dir : str or None
If None, files will be unpacked to the default location (a folder in
the same location as the downloaded tar file, with the suffix
``.untar`` added). Otherwise, files will be unpacked to
``extract_dir``, which is interpreted as a *relative path* (relative to
the cache location provided by :func:`pooch.retrieve` or
:meth:`pooch.Pooch.fetch`).
"""
@property
def suffix(self):
"""
String appended to unpacked archive folder name.
Only used if extract_dir is None.
"""
return ".untar"
def _all_members(self, fname):
"""Return all members from a given archive."""
with TarFile.open(fname, "r") as tar_file:
return [info.name for info in tar_file.getmembers()]
def _extract_file(self, fname, extract_dir):
"""
This method receives an argument for the archive to extract and the
destination path.
"""
filter_kwarg = {} if sys.version_info < (3, 12) else {"filter": "data"}
with TarFile.open(fname, "r") as tar_file:
if self.members is None:
get_logger().info(
"Untarring contents of '%s' to '%s'", fname, extract_dir
)
# Unpack all files from the archive into our new folder
tar_file.extractall(path=extract_dir, **filter_kwarg)
else:
for member in self.members:
get_logger().info(
"Extracting '%s' from '%s' to '%s'", member, fname, extract_dir
)
# If the member is a dir, we need to get the names of the
# elements it contains for extraction (TarFile does not
# support dirs on .extract). If it's not a dir, this will
# only include the member itself.
# Based on:
# https://stackoverflow.com/questions/8008829/extract-only-a-single-directory-from-tar
# Can't use .getnames because extractall expects TarInfo
# objects.
subdir_members = [
info
for info in tar_file.getmembers()
if os.path.normpath(info.name).startswith(
os.path.normpath(member)
)
]
# Extract the data file from within the archive
tar_file.extractall(
members=subdir_members, path=extract_dir, **filter_kwarg
)
class Decompress: # pylint: disable=too-few-public-methods
"""
Processor that decompresses a file and returns the decompressed version.
Use with :meth:`pooch.Pooch.fetch` or :func:`pooch.retrieve` to decompress
a downloaded data file so that it can be easily opened. Useful for data
files that take a long time to decompress (exchanging disk space for
speed).
Supported decompression methods are LZMA (``.xz``), bzip2 (``.bz2``), and
gzip (``.gz``).
File names with the standard extensions (see above) can use
``method="auto"`` to automatically determine the compression method. This
can be overridden by setting the *method* argument.
.. note::
To unpack zip and tar archives with one or more files, use
:class:`pooch.Unzip` and :class:`pooch.Untar` instead.
The output file is ``{fname}.decomp`` by default but it can be changed by
setting the ``name`` parameter.
.. warning::
Passing in ``name`` can cause existing data to be lost! For example, if
a file already exists with the specified name it will be overwritten
with the new decompressed file content. **Use this option with
caution.**
Parameters
----------
method : str
Name of the compression method. Can be "auto", "lzma", "xz", "bzip2",
or "gzip".
name : None or str
Defines the decompressed file name. The file name will be
``{fname}.decomp`` if ``None`` (default) or the given name otherwise.
Note that the name should **not** include the full (or relative) path,
it should be just the file name itself.
"""
modules = {"auto": None, "lzma": lzma, "xz": lzma, "gzip": gzip, "bzip2": bz2}
extensions = {".xz": "lzma", ".gz": "gzip", ".bz2": "bzip2"}
def __init__(self, method="auto", name=None):
self.method = method
self.name = name
def __call__(self, fname, action, pooch):
"""
Decompress the given file.
The output file will be either ``{fname}.decomp`` or the given *name*
class attribute.
Parameters
----------
fname : str
Full path of the compressed file in local storage.
action : str
Indicates what action was taken by :meth:`pooch.Pooch.fetch` or
:func:`pooch.retrieve`:
- ``"download"``: File didn't exist locally and was downloaded
- ``"update"``: Local file was outdated and was re-download
- ``"fetch"``: File exists and is updated so it wasn't downloaded
pooch : :class:`pooch.Pooch`
The instance of :class:`pooch.Pooch` that is calling this.
Returns
-------
fname : str
The full path to the decompressed file.
"""
if self.name is None:
decompressed = fname + ".decomp"
else:
decompressed = os.path.join(os.path.dirname(fname), self.name)
if action in ("update", "download") or not os.path.exists(decompressed):
get_logger().info(
"Decompressing '%s' to '%s' using method '%s'.",
fname,
decompressed,
self.method,
)
module = self._compression_module(fname)
with open(decompressed, "w+b") as output:
with module.open(fname) as compressed:
shutil.copyfileobj(compressed, output)
return decompressed
def _compression_module(self, fname):
"""
Get the Python module compatible with fname and the chosen method.
If the *method* attribute is "auto", will select a method based on the
extension. If no recognized extension is in the file name, will raise a
ValueError.
"""
error_archives = "To unpack zip/tar archives, use pooch.Unzip/Untar instead."
if self.method not in self.modules:
message = (
f"Invalid compression method '{self.method}'. "
f"Must be one of '{list(self.modules.keys())}'."
)
if self.method in {"zip", "tar"}:
message = " ".join([message, error_archives])
raise ValueError(message)
if self.method == "auto":
ext = os.path.splitext(fname)[-1]
if ext not in self.extensions:
message = (
f"Unrecognized file extension '{ext}'. "
f"Must be one of '{list(self.extensions.keys())}'."
)
if ext in {".zip", ".tar"}:
message = " ".join([message, error_archives])
raise ValueError(message)
return self.modules[self.extensions[ext]]
return self.modules[self.method]
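# Illustrative sketch (not part of the original Pooch source): forcing the
# compression method and the output name with Decompress, per the `name`
# warning above. `pup` and the file names are placeholders.
def _example_decompress_named(pup):
    # Decompresses "data.csv.gz" into "data.csv" in the same cache folder,
    # overwriting any existing file with that name.
    return pup.fetch(
        "data.csv.gz",
        processor=Decompress(method="gzip", name="data.csv"),
    )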


@@ -0,0 +1,6 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#

File diff suppressed because it is too large


@@ -0,0 +1,10 @@
subdir/tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
tiny-data.zip 0d49e94f07bc1866ec57e7fd1b93a351fba36842ec9b13dd50bf94e8dfa35cbb
large-data.txt 98de171fb320da82982e6bf0f3994189fff4b42b23328769afce12bdd340444a
store.zip 0498d2a001e71051bbd2acd2346f38da7cbd345a633cb7bf0f8a20938714b51a
tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d https://some-site/tiny-data.txt
tiny-data.tar.gz 41503f083814f43a01a8e9a30c28d7a9fe96839a99727a7fdd0acf7cd5bab63b
store.tar.gz 088c7f4e0f1859b1c769bb6065de24376f366374817ede8691a6ac2e49f29511
tiny-data.txt.bz2 753663687a4040c90c8578061867d1df623e6aa8011c870a5dbd88ee3c82e306
tiny-data.txt.gz 2e2da6161291657617c32192dba95635706af80c6e7335750812907b58fd4b52
tiny-data.txt.xz 99dcb5c32a6e916344bacb4badcbc2f2b6ee196977d1d8187610c21e7e607765


@@ -0,0 +1,2 @@
tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
some-file.txt second_element third_element forth_element


@@ -0,0 +1,2 @@
"file with spaces.txt" baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
other\ with\ spaces.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d


@@ -0,0 +1,12 @@
subdir/tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
large-data.txt 98de171fb320da82982e6bf0f3994189fff4b42b23328769afce12bdd340444a
tiny-data.zip 0d49e94f07bc1866ec57e7fd1b93a351fba36842ec9b13dd50bf94e8dfa35cbb
store.zip 0498D2A001E71051BBD2ACD2346F38DA7CBD345A633CB7BF0F8A20938714B51A
tiny-data.tar.gz 41503f083814f43a01a8e9a30c28d7a9fe96839a99727a7fdd0acf7cd5bab63b
store.tar.gz 088c7f4e0f1859b1c769bb6065de24376f366374817ede8691a6ac2e49f29511
tiny-data.txt.bz2 753663687a4040c90c8578061867d1df623e6aa8011c870a5dbd88ee3c82e306
tiny-data.txt.gz 2e2da6161291657617c32192dba95635706af80c6e7335750812907b58fd4b52
tiny-data.txt.xz 99dcb5c32a6e916344bacb4badcbc2f2b6ee196977d1d8187610c21e7e607765


@@ -0,0 +1,14 @@
# a comment
subdir/tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d
large-data.txt 98de171fb320da82982e6bf0f3994189fff4b42b23328769afce12bdd340444a
tiny-data.zip 0d49e94f07bc1866ec57e7fd1b93a351fba36842ec9b13dd50bf94e8dfa35cbb
# a comment with a starting space
store.zip 0498d2a001e71051bbd2acd2346f38da7cbd345a633cb7bf0f8a20938714b51a
tiny-data.tar.gz 41503f083814f43a01a8e9a30c28d7a9fe96839a99727a7fdd0acf7cd5bab63b
store.tar.gz 088c7f4e0f1859b1c769bb6065de24376f366374817ede8691a6ac2e49f29511
tiny-data.txt.bz2 753663687a4040c90c8578061867d1df623e6aa8011c870a5dbd88ee3c82e306
tiny-data.txt.gz 2e2da6161291657617c32192dba95635706af80c6e7335750812907b58fd4b52
tiny-data.txt.xz 99dcb5c32a6e916344bacb4badcbc2f2b6ee196977d1d8187610c21e7e607765


@@ -0,0 +1,2 @@
# A tiny data file for test purposes only
1 2 3 4 5 6


@@ -0,0 +1,2 @@
# A tiny data file for test purposes only
1 2 3 4 5 6


@@ -0,0 +1,2 @@
# A tiny data file for test purposes only
1 2 3 4 5 6


@@ -0,0 +1,689 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
# pylint: disable=redefined-outer-name
"""
Test the core class and factory function.
"""
import hashlib
import os
from pathlib import Path
from tempfile import TemporaryDirectory
import pytest
from ..core import create, Pooch, retrieve, download_action, stream_download
from ..utils import get_logger, temporary_file, os_cache
from ..hashes import file_hash, hash_matches
# Import the core module so that we can monkeypatch some functions
from .. import core
from ..downloaders import HTTPDownloader, FTPDownloader
from .utils import (
pooch_test_url,
data_over_ftp,
pooch_test_figshare_url,
pooch_test_zenodo_url,
pooch_test_zenodo_with_slash_url,
pooch_test_dataverse_url,
pooch_test_registry,
check_tiny_data,
check_large_data,
capture_log,
mirror_directory,
)
DATA_DIR = str(Path(__file__).parent / "data")
REGISTRY = pooch_test_registry()
BASEURL = pooch_test_url()
FIGSHAREURL = pooch_test_figshare_url()
ZENODOURL = pooch_test_zenodo_url()
ZENODOURL_W_SLASH = pooch_test_zenodo_with_slash_url()
DATAVERSEURL = pooch_test_dataverse_url()
REGISTRY_CORRUPTED = {
# The same data file but I changed the hash manually to a wrong one
"tiny-data.txt": "098h0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d"
}
@pytest.fixture
def data_dir_mirror(tmp_path):
"""
Mirror the test data folder on a temporary directory. Needed to avoid
permission errors when pooch is installed on a non-writable path.
"""
return mirror_directory(DATA_DIR, tmp_path)
@pytest.mark.network
def test_retrieve():
"Try downloading some data with retrieve"
with TemporaryDirectory() as local_store:
data_file = "tiny-data.txt"
url = BASEURL + data_file
# Check that the logs say that the file is being downloaded
with capture_log() as log_file:
fname = retrieve(url, known_hash=None, path=local_store)
logs = log_file.getvalue()
assert logs.split()[0] == "Downloading"
assert "SHA256 hash of downloaded file:" in logs
assert REGISTRY[data_file] in logs
# Check that the downloaded file has the right content
assert data_file == fname[-len(data_file) :]
check_tiny_data(fname)
assert file_hash(fname) == REGISTRY[data_file]
# Check that no logging happens when not downloading
with capture_log() as log_file:
fname = retrieve(url, known_hash=None, path=local_store)
assert log_file.getvalue() == ""
with capture_log() as log_file:
fname = retrieve(url, known_hash=REGISTRY[data_file], path=local_store)
assert log_file.getvalue() == ""
@pytest.mark.network
def test_retrieve_fname():
"Try downloading some data with retrieve and setting the file name"
with TemporaryDirectory() as local_store:
data_file = "tiny-data.txt"
url = BASEURL + data_file
# Check that the logs say that the file is being downloaded
with capture_log() as log_file:
fname = retrieve(url, known_hash=None, path=local_store, fname=data_file)
logs = log_file.getvalue()
assert logs.split()[0] == "Downloading"
assert "SHA256 hash of downloaded file:" in logs
assert REGISTRY[data_file] in logs
# Check that the downloaded file has the right name and content
assert data_file == os.path.split(fname)[1]
check_tiny_data(fname)
assert file_hash(fname) == REGISTRY[data_file]
@pytest.mark.network
def test_retrieve_default_path():
"Try downloading some data with retrieve to the default cache location"
data_file = "tiny-data.txt"
url = BASEURL + data_file
expected_location = os_cache("pooch") / data_file
try:
# Check that the logs say that the file is being downloaded
with capture_log() as log_file:
fname = retrieve(url, known_hash=None, fname=data_file)
logs = log_file.getvalue()
assert logs.split()[0] == "Downloading"
assert str(os_cache("pooch").resolve()) in logs
assert "SHA256 hash of downloaded file" in logs
assert REGISTRY[data_file] in logs
# Check that the downloaded file has the right content
assert fname == str(expected_location.resolve())
check_tiny_data(fname)
assert file_hash(fname) == REGISTRY[data_file]
finally:
if os.path.exists(str(expected_location)):
os.remove(str(expected_location))
def test_pooch_local(data_dir_mirror):
"Setup a pooch that already has the local data and test the fetch."
pup = Pooch(path=data_dir_mirror, base_url="some bogus URL", registry=REGISTRY)
true = str(data_dir_mirror / "tiny-data.txt")
fname = pup.fetch("tiny-data.txt")
assert true == fname
check_tiny_data(fname)
@pytest.mark.network
@pytest.mark.parametrize(
"url",
[
BASEURL,
pytest.param(FIGSHAREURL, marks=pytest.mark.figshare),
ZENODOURL,
DATAVERSEURL,
],
ids=["https", "figshare", "zenodo", "dataverse"],
)
def test_pooch_custom_url(url):
"Have pooch download the file from URL that is not base_url"
with TemporaryDirectory() as local_store:
path = Path(local_store)
urls = {"tiny-data.txt": url + "tiny-data.txt"}
# Setup a pooch in a temp dir
pup = Pooch(path=path, base_url="", registry=REGISTRY, urls=urls)
# Check that the logs say that the file is being downloaded
with capture_log() as log_file:
fname = pup.fetch("tiny-data.txt")
logs = log_file.getvalue()
assert logs.split()[0] == "Downloading"
assert logs.split()[-1] == f"'{path}'."
check_tiny_data(fname)
# Check that no logging happens when there are no events
with capture_log() as log_file:
fname = pup.fetch("tiny-data.txt")
assert log_file.getvalue() == ""
@pytest.mark.network
@pytest.mark.parametrize(
"url",
[
BASEURL,
pytest.param(FIGSHAREURL, marks=pytest.mark.figshare),
ZENODOURL,
DATAVERSEURL,
],
ids=["https", "figshare", "zenodo", "dataverse"],
)
def test_pooch_download(url):
"Setup a pooch that has no local data and needs to download"
with TemporaryDirectory() as local_store:
path = Path(local_store)
true_path = str(path / "tiny-data.txt")
# Setup a pooch in a temp dir
pup = Pooch(path=path, base_url=url, registry=REGISTRY)
# Check that the logs say that the file is being downloaded
with capture_log() as log_file:
fname = pup.fetch("tiny-data.txt")
logs = log_file.getvalue()
assert logs.split()[0] == "Downloading"
assert logs.split()[-1] == f"'{path}'."
# Check that the downloaded file has the right content
assert true_path == fname
check_tiny_data(fname)
assert file_hash(fname) == REGISTRY["tiny-data.txt"]
# Check that no logging happens when not downloading
with capture_log() as log_file:
fname = pup.fetch("tiny-data.txt")
assert log_file.getvalue() == ""
class FakeHashMatches: # pylint: disable=too-few-public-methods
"Create a fake version of hash_matches that fails n times"
def __init__(self, nfailures):
self.nfailures = nfailures
self.failed = 0
def hash_matches(self, *args, **kwargs):
"Fail n times before finally passing"
if self.failed < self.nfailures:
self.failed += 1
# Give it an invalid hash to force a failure
return hash_matches(args[0], "bla", **kwargs)
return hash_matches(*args, **kwargs)
@pytest.mark.network
def test_pooch_download_retry_off_by_default(monkeypatch):
"Check that retrying the download is off by default"
with TemporaryDirectory() as local_store:
monkeypatch.setattr(core, "hash_matches", FakeHashMatches(3).hash_matches)
# Setup a pooch without download retrying
path = Path(local_store)
pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)
# Make sure it fails with no retries
with pytest.raises(ValueError) as error:
with capture_log() as log_file:
pup.fetch("tiny-data.txt")
assert "does not match the known hash" in str(error)
# Check that the log doesn't have the download retry message
logs = log_file.getvalue().strip().split("\n")
assert len(logs) == 1
assert logs[0].startswith("Downloading")
assert logs[0].endswith(f"'{path}'.")
class FakeSleep: # pylint: disable=too-few-public-methods
"Create a fake version of sleep that logs the specified times"
def __init__(self):
self.times = []
def sleep(self, secs):
"Store the time and doesn't sleep"
self.times.append(secs)
@pytest.mark.network
def test_pooch_download_retry(monkeypatch):
"Check that retrying the download works if the hash is different"
with TemporaryDirectory() as local_store:
monkeypatch.setattr(core, "hash_matches", FakeHashMatches(11).hash_matches)
fakesleep = FakeSleep()
monkeypatch.setattr(core.time, "sleep", fakesleep.sleep)
# Setup a pooch with download retrying
path = Path(local_store)
true_path = str(path / "tiny-data.txt")
retries = 11
pup = Pooch(
path=path, base_url=BASEURL, registry=REGISTRY, retry_if_failed=retries
)
# Check that the logs say that the download failed n times
with capture_log() as log_file:
fname = pup.fetch("tiny-data.txt")
logs = log_file.getvalue().strip().split("\n")
assert len(logs) == 1 + retries
assert logs[0].startswith("Downloading")
assert logs[0].endswith(f"'{path}'.")
for i, line in zip(range(retries, 0, -1), logs[1:]):
assert "Failed to download" in line
plural = "s" if i > 1 else ""
assert f"download again {i} more time{plural}." in line
# Check that the sleep time increases but stops at 10s
assert fakesleep.times == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10]
# Check that the downloaded file has the right content
assert true_path == fname
check_tiny_data(fname)
assert file_hash(fname) == REGISTRY["tiny-data.txt"]
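# A sketch of the backoff the sleep-time assertion above implies (an
# inference from this test, not a statement about pooch's implementation):
def _sketch_retry_sleep_seconds(attempt):
    "Seconds slept before retry number `attempt` (1-based): linear, capped at 10."
    return min(attempt, 10)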
@pytest.mark.network
def test_pooch_download_retry_fails_eventually(monkeypatch):
"Check that retrying the download fails after the set amount of retries"
with TemporaryDirectory() as local_store:
monkeypatch.setattr(core, "hash_matches", FakeHashMatches(3).hash_matches)
# Setup a pooch with insufficient retry attempts
path = Path(local_store)
pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY, retry_if_failed=1)
# Make sure it fails with no retries
with pytest.raises(ValueError) as error:
# Check that the logs say that the download failed n times
with capture_log() as log_file:
pup.fetch("tiny-data.txt")
        logs = log_file.getvalue().strip().split("\n")
        assert len(logs) == 2
        assert logs[0].startswith("Downloading")
        assert logs[0].endswith(f"'{path}'.")
        assert "Failed to download" in logs[1]
        assert "download again 1 more time." in logs[1]
assert "does not match the known hash" in str(error)
@pytest.mark.network
def test_pooch_logging_level():
"Setup a pooch and check that no logging happens when the level is raised"
with TemporaryDirectory() as local_store:
path = Path(local_store)
urls = {"tiny-data.txt": BASEURL + "tiny-data.txt"}
# Setup a pooch in a temp dir
pup = Pooch(path=path, base_url="", registry=REGISTRY, urls=urls)
# Capture only critical logging events
with capture_log("CRITICAL") as log_file:
fname = pup.fetch("tiny-data.txt")
assert log_file.getvalue() == ""
check_tiny_data(fname)
@pytest.mark.network
def test_pooch_update():
"Setup a pooch that already has the local data but the file is outdated"
with TemporaryDirectory() as local_store:
path = Path(local_store)
# Create a dummy version of tiny-data.txt that is different from the
# one in the remote storage
true_path = str(path / "tiny-data.txt")
        with open(true_path, "w", encoding="utf-8") as fout:
            fout.write("different data")
# Setup a pooch in a temp dir
pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)
# Check that the logs say that the file is being updated
with capture_log() as log_file:
fname = pup.fetch("tiny-data.txt")
logs = log_file.getvalue()
assert logs.split()[0] == "Updating"
assert logs.split()[-1] == f"'{path}'."
# Check that the updated file has the right content
assert true_path == fname
check_tiny_data(fname)
assert file_hash(fname) == REGISTRY["tiny-data.txt"]
# Check that no logging happens when not downloading
with capture_log() as log_file:
fname = pup.fetch("tiny-data.txt")
assert log_file.getvalue() == ""
def test_pooch_update_disallowed():
"Test that disallowing updates works."
with TemporaryDirectory() as local_store:
path = Path(local_store)
# Create a dummy version of tiny-data.txt that is different from the
# one in the remote storage
true_path = str(path / "tiny-data.txt")
        with open(true_path, "w", encoding="utf-8") as fout:
            fout.write("different data")
# Setup a pooch in a temp dir
pup = Pooch(
path=path,
base_url=BASEURL,
registry=REGISTRY,
allow_updates=False,
)
with pytest.raises(ValueError):
pup.fetch("tiny-data.txt")
def test_pooch_update_disallowed_environment():
"Test that disallowing updates works through an environment variable."
variable_name = "MYPROJECT_DISALLOW_UPDATES"
try:
os.environ[variable_name] = "False"
with TemporaryDirectory() as local_store:
path = Path(local_store)
# Create a dummy version of tiny-data.txt that is different from
# the one in the remote storage
true_path = str(path / "tiny-data.txt")
            with open(true_path, "w", encoding="utf-8") as fout:
                fout.write("different data")
# Setup a pooch in a temp dir
pup = create(
path=path,
base_url=BASEURL,
registry=REGISTRY,
allow_updates=variable_name,
)
with pytest.raises(ValueError):
pup.fetch("tiny-data.txt")
finally:
os.environ.pop(variable_name)
def test_pooch_create_base_url_no_trailing_slash():
"""
Test if pooch.create appends a trailing slash to the base url if missing
"""
base_url = "https://mybase.url"
pup = create(base_url=base_url, registry=None, path=DATA_DIR)
assert pup.base_url == base_url + "/"
@pytest.mark.network
def test_pooch_corrupted(data_dir_mirror):
"Raise an exception if the file hash doesn't match the registry"
# Test the case where the file wasn't in the directory
with TemporaryDirectory() as local_store:
path = os.path.abspath(local_store)
pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY_CORRUPTED)
with capture_log() as log_file:
with pytest.raises(ValueError) as error:
pup.fetch("tiny-data.txt")
assert "(tiny-data.txt)" in str(error.value)
logs = log_file.getvalue()
assert logs.split()[0] == "Downloading"
assert logs.split()[-1] == f"'{path}'."
# and the case where the file exists but hash doesn't match
pup = Pooch(path=data_dir_mirror, base_url=BASEURL, registry=REGISTRY_CORRUPTED)
with capture_log() as log_file:
with pytest.raises(ValueError) as error:
pup.fetch("tiny-data.txt")
assert "(tiny-data.txt)" in str(error.value)
logs = log_file.getvalue()
assert logs.split()[0] == "Updating"
assert logs.split()[-1] == f"'{data_dir_mirror}'."
def test_pooch_file_not_in_registry():
"Should raise an exception if the file is not in the registry."
pup = Pooch(
path="it shouldn't matter", base_url="this shouldn't either", registry=REGISTRY
)
with pytest.raises(ValueError):
pup.fetch("this-file-does-not-exit.csv")
def test_pooch_load_registry():
"Loading the registry from a file should work"
pup = Pooch(path="", base_url="")
pup.load_registry(os.path.join(DATA_DIR, "registry.txt"))
assert pup.registry == REGISTRY
    assert sorted(pup.registry_files) == sorted(REGISTRY)
def test_pooch_load_registry_comments():
"Loading the registry from a file and strip line comments"
pup = Pooch(path="", base_url="")
pup.load_registry(os.path.join(DATA_DIR, "registry_comments.txt"))
assert pup.registry == REGISTRY
    assert sorted(pup.registry_files) == sorted(REGISTRY)
def test_pooch_load_registry_fileobj():
"Loading the registry from a file object"
path = os.path.join(DATA_DIR, "registry.txt")
# Binary mode
pup = Pooch(path="", base_url="")
with open(path, "rb") as fin:
pup.load_registry(fin)
assert pup.registry == REGISTRY
    assert sorted(pup.registry_files) == sorted(REGISTRY)
# Text mode
pup = Pooch(path="", base_url="")
with open(path, "r", encoding="utf-8") as fin:
pup.load_registry(fin)
assert pup.registry == REGISTRY
    assert sorted(pup.registry_files) == sorted(REGISTRY)
def test_pooch_load_registry_custom_url():
"Load the registry from a file with a custom URL inserted"
pup = Pooch(path="", base_url="")
pup.load_registry(os.path.join(DATA_DIR, "registry-custom-url.txt"))
assert pup.registry == REGISTRY
assert pup.urls == {"tiny-data.txt": "https://some-site/tiny-data.txt"}
def test_pooch_load_registry_invalid_line():
"Should raise an exception when a line doesn't have two elements"
pup = Pooch(path="", base_url="", registry={})
with pytest.raises(IOError):
pup.load_registry(os.path.join(DATA_DIR, "registry-invalid.txt"))
def test_pooch_load_registry_with_spaces():
"Should check that spaces in filenames are allowed in registry files"
pup = Pooch(path="", base_url="")
pup.load_registry(os.path.join(DATA_DIR, "registry-spaces.txt"))
assert "file with spaces.txt" in pup.registry
assert "other with spaces.txt" in pup.registry
@pytest.mark.network
def test_check_availability():
"Should correctly check availability of existing and non existing files"
# Check available remote file
pup = Pooch(path=DATA_DIR, base_url=BASEURL, registry=REGISTRY)
assert pup.is_available("tiny-data.txt")
# Check non available remote file
pup = Pooch(path=DATA_DIR, base_url=BASEURL + "wrong-url/", registry=REGISTRY)
assert not pup.is_available("tiny-data.txt")
# Wrong file name
registry = {"not-a-real-data-file.txt": "notarealhash"}
registry.update(REGISTRY)
pup = Pooch(path=DATA_DIR, base_url=BASEURL, registry=registry)
assert not pup.is_available("not-a-real-data-file.txt")
def test_check_availability_on_ftp(ftpserver):
"Should correctly check availability of existing and non existing files"
with data_over_ftp(ftpserver, "tiny-data.txt") as url:
# Check available remote file on FTP server
pup = Pooch(
path=DATA_DIR,
base_url=url.replace("tiny-data.txt", ""),
registry={
"tiny-data.txt": "baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d",
"doesnot_exist.zip": "jdjdjdjdflld",
},
)
downloader = FTPDownloader(port=ftpserver.server_port)
assert pup.is_available("tiny-data.txt", downloader=downloader)
# Check non available remote file
assert not pup.is_available("doesnot_exist.zip", downloader=downloader)
def test_check_availability_invalid_downloader():
"Should raise an exception if the downloader doesn't support this"
def downloader(url, output, pooch): # pylint: disable=unused-argument
"A downloader that doesn't support check_only"
return None
pup = Pooch(path=DATA_DIR, base_url=BASEURL, registry=REGISTRY)
msg = "does not support availability checks."
with pytest.raises(NotImplementedError, match=msg):
pup.is_available("tiny-data.txt", downloader=downloader)
@pytest.mark.network
def test_fetch_with_downloader(capsys):
"Setup a downloader function for fetch"
def download(url, output_file, pup): # pylint: disable=unused-argument
"Download through HTTP and warn that we're doing it"
get_logger().info("downloader executed")
HTTPDownloader()(url, output_file, pup)
with TemporaryDirectory() as local_store:
path = Path(local_store)
# Setup a pooch in a temp dir
pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)
# Check that the logs say that the file is being downloaded
with capture_log() as log_file:
fname = pup.fetch("large-data.txt", downloader=download)
logs = log_file.getvalue()
lines = logs.splitlines()
assert len(lines) == 2
assert lines[0].split()[0] == "Downloading"
assert lines[1] == "downloader executed"
# Read stderr and make sure no progress bar was printed by default
assert not capsys.readouterr().err
# Check that the downloaded file has the right content
check_large_data(fname)
# Check that no logging happens when not downloading
with capture_log() as log_file:
fname = pup.fetch("large-data.txt")
assert log_file.getvalue() == ""
def test_invalid_hash_alg(data_dir_mirror):
"Test an invalid hashing algorithm"
pup = Pooch(
path=data_dir_mirror, base_url=BASEURL, registry={"tiny-data.txt": "blah:1234"}
)
with pytest.raises(ValueError) as exc:
pup.fetch("tiny-data.txt")
assert "'blah'" in str(exc.value)
def test_alternative_hashing_algorithms(data_dir_mirror):
"Test different hashing algorithms using local data"
fname = str(data_dir_mirror / "tiny-data.txt")
check_tiny_data(fname)
with open(fname, "rb") as fin:
data = fin.read()
for alg in ("sha512", "md5"):
hasher = hashlib.new(alg)
hasher.update(data)
registry = {"tiny-data.txt": f"{alg}:{hasher.hexdigest()}"}
pup = Pooch(path=data_dir_mirror, base_url="some bogus URL", registry=registry)
assert fname == pup.fetch("tiny-data.txt")
check_tiny_data(fname)
def test_download_action():
"Test that the right action is performed based on file existing"
action, verb = download_action(
Path("this_file_does_not_exist.txt"), known_hash=None
)
assert action == "download"
assert verb == "Downloading"
with temporary_file() as tmp:
action, verb = download_action(Path(tmp), known_hash="not the correct hash")
assert action == "update"
assert verb == "Updating"
with temporary_file() as tmp:
with open(tmp, "w", encoding="utf-8") as output:
output.write("some data")
action, verb = download_action(Path(tmp), known_hash=file_hash(tmp))
assert action == "fetch"
assert verb == "Fetching"
@pytest.mark.network
@pytest.mark.parametrize("fname", ["tiny-data.txt", "subdir/tiny-data.txt"])
def test_stream_download(fname):
"Check that downloading a file over HTTP works as expected"
    # Use the data in store/ because that's where the subdir fixture lives
url = BASEURL + "store/" + fname
known_hash = REGISTRY[fname]
downloader = HTTPDownloader()
with TemporaryDirectory() as local_store:
destination = Path(local_store) / fname
assert not destination.exists()
stream_download(url, destination, known_hash, downloader, pooch=None)
assert destination.exists()
check_tiny_data(str(destination))
@pytest.mark.network
@pytest.mark.parametrize(
"url",
[pytest.param(FIGSHAREURL, marks=pytest.mark.figshare), ZENODOURL, DATAVERSEURL],
ids=["figshare", "zenodo", "dataverse"],
)
def test_load_registry_from_doi(url):
"""Check that the registry is correctly populated from the API"""
with TemporaryDirectory() as local_store:
path = os.path.abspath(local_store)
pup = Pooch(path=path, base_url=url)
pup.load_registry_from_doi()
# Check the existence of all files in the registry
assert len(pup.registry) == 2
assert "tiny-data.txt" in pup.registry
assert "store.zip" in pup.registry
# Ensure that all files have correct checksums by fetching them
for filename in pup.registry:
pup.fetch(filename)
@pytest.mark.network
def test_load_registry_from_doi_zenodo_with_slash():
"""
Check that the registry is correctly populated from the Zenodo API when
the filename contains a slash
"""
url = ZENODOURL_W_SLASH
with TemporaryDirectory() as local_store:
path = os.path.abspath(local_store)
pup = Pooch(path=path, base_url=url)
pup.load_registry_from_doi()
# Check the existence of all files in the registry
assert len(pup.registry) == 1
assert "santisoler/pooch-test-data-v1.zip" in pup.registry
# Ensure that all files have correct checksums by fetching them
for filename in pup.registry:
pup.fetch(filename)
def test_wrong_load_registry_from_doi():
"""Check that non-DOI URLs produce an error"""
pup = Pooch(path="", base_url=BASEURL)
with pytest.raises(ValueError) as exc:
pup.load_registry_from_doi()
assert "only implemented for DOIs" in str(exc.value)

View File

@@ -0,0 +1,582 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Test the downloader classes and functions separately from the Pooch core.
"""
import os
import sys
from tempfile import TemporaryDirectory
import pytest
from requests import HTTPError
# Mypy doesn't like assigning None to a module name like this, hence the
# "type: ignore" comments below (a separate guard variable would also work).
try:
import tqdm
except ImportError:
tqdm = None # type: ignore
try:
import paramiko
except ImportError:
paramiko = None # type: ignore
from .. import Pooch
from ..downloaders import (
HTTPDownloader,
FTPDownloader,
SFTPDownloader,
DOIDownloader,
choose_downloader,
FigshareRepository,
ZenodoRepository,
DataverseRepository,
doi_to_url,
REQUESTS_HEADERS,
)
from ..processors import Unzip
from .utils import (
pooch_test_url,
check_large_data,
check_tiny_data,
data_over_ftp,
pooch_test_figshare_url,
pooch_test_zenodo_url,
pooch_test_zenodo_with_slash_url,
pooch_test_dataverse_url,
)
BASEURL = pooch_test_url()
FIGSHAREURL = pooch_test_figshare_url()
ZENODOURL = pooch_test_zenodo_url()
ZENODOURL_W_SLASH = pooch_test_zenodo_with_slash_url()
DATAVERSEURL = pooch_test_dataverse_url()
@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
@pytest.mark.parametrize(
"url",
[
BASEURL + "tiny-data.txt", # HTTPDownloader
ZENODOURL, # DOIDownloader
],
)
def test_progressbar_kwarg_passed(url):
"""The progressbar keyword argument must pass through choose_downloader"""
downloader = choose_downloader(url, progressbar=True)
assert downloader.progressbar is True
@pytest.mark.skipif(paramiko is None, reason="requires paramiko")
def test_progressbar_kwarg_passed_sftp():
"""The progressbar keyword argument must pass through choose_downloader"""
url = "sftp://test.rebex.net/pub/example/pocketftp.png"
downloader = choose_downloader(url, progressbar=True)
assert downloader.progressbar is True
def test_unsupported_protocol():
"Should raise ValueError when protocol is not supported"
with pytest.raises(ValueError):
choose_downloader("httpup://some-invalid-url.com")
# Simulate the DOI format
with pytest.raises(ValueError):
choose_downloader("doii:XXX/XXX/file")
@pytest.mark.network
def test_invalid_doi_repository():
"Should fail if data repository is not supported"
with pytest.raises(ValueError) as exc:
# Use the DOI of the Pooch paper in JOSS (not a data repository)
DOIDownloader()(
url="doi:10.21105/joss.01943/file_name.txt", output_file=None, pooch=None
)
assert "Invalid data repository 'joss.theoj.org'" in str(exc.value)
@pytest.mark.network
def test_doi_url_not_found():
"Should fail if the DOI is not found"
with pytest.raises(HTTPError):
doi_to_url(doi="NOTAREALDOI")
@pytest.mark.network
@pytest.mark.parametrize(
"repository,doi",
[
pytest.param(
FigshareRepository,
"10.6084/m9.figshare.14763051.v1",
marks=pytest.mark.figshare,
),
(ZenodoRepository, "10.5281/zenodo.4924875"),
(DataverseRepository, "10.11588/data/TKCFEF"),
],
ids=["figshare", "zenodo", "dataverse"],
)
def test_repository_url_file_not_found(repository, doi):
"Should fail if the file is not found in the archive"
with pytest.raises(ValueError) as exc:
url = doi_to_url(doi)
repo = repository.initialize(doi, url)
repo.download_url(file_name="bla.txt")
assert "File 'bla.txt' not found" in str(exc.value)
@pytest.mark.network
@pytest.mark.parametrize(
"url",
[pytest.param(FIGSHAREURL, marks=pytest.mark.figshare), ZENODOURL, DATAVERSEURL],
ids=["figshare", "zenodo", "dataverse"],
)
def test_doi_downloader(url):
"Test the DOI downloader"
# Use the test data we have on the repository
with TemporaryDirectory() as local_store:
downloader = DOIDownloader()
outfile = os.path.join(local_store, "tiny-data.txt")
downloader(url + "tiny-data.txt", outfile, None)
check_tiny_data(outfile)
@pytest.mark.network
def test_zenodo_downloader_with_slash_in_fname():
"""
Test the Zenodo downloader when the path contains a forward slash
Related to issue #336
"""
# Use the test data we have on the repository
with TemporaryDirectory() as local_store:
base_url = ZENODOURL_W_SLASH + "santisoler/pooch-test-data-v1.zip"
downloader = DOIDownloader()
outfile = os.path.join(local_store, "test-data.zip")
downloader(base_url, outfile, None)
# unpack the downloaded zip file so we can check the integrity of
# tiny-data.txt
fnames = Unzip()(outfile, action="download", pooch=None)
(fname,) = [f for f in fnames if "tiny-data.txt" in f]
check_tiny_data(fname)
@pytest.mark.network
@pytest.mark.figshare
def test_figshare_unspecified_version():
"""
Test if passing a Figshare url without a version warns about it, but still
downloads it.
"""
url = FIGSHAREURL
    # Remove the last bit of the doi, where the version is specified
url = url[: url.rindex(".")] + "/"
# Create expected warning message
doi = url[4:-1]
warning_msg = f"The Figshare DOI '{doi}' doesn't specify which version of "
with TemporaryDirectory() as local_store:
downloader = DOIDownloader()
outfile = os.path.join(local_store, "tiny-data.txt")
with pytest.warns(UserWarning, match=warning_msg):
downloader(url + "tiny-data.txt", outfile, None)
@pytest.mark.network
@pytest.mark.figshare
@pytest.mark.parametrize(
"version, missing, present",
[
(
1,
"LC08_L2SP_218074_20190114_20200829_02_T1-cropped.tar.gz",
"cropped-before.tar.gz",
),
(
2,
"cropped-before.tar.gz",
"LC08_L2SP_218074_20190114_20200829_02_T1-cropped.tar.gz",
),
],
)
def test_figshare_data_repository_versions(version, missing, present):
"""
Test if setting the version in Figshare DOI works as expected
"""
# Use a Figshare repo as example (we won't download files from it since
# they are too big)
doi = f"10.6084/m9.figshare.21665630.v{version}"
url = f"https://doi.org/{doi}/"
figshare = FigshareRepository(doi, url)
filenames = [item["name"] for item in figshare.api_response]
assert present in filenames
assert missing not in filenames
@pytest.mark.network
def test_ftp_downloader(ftpserver):
"Test ftp downloader"
with data_over_ftp(ftpserver, "tiny-data.txt") as url:
with TemporaryDirectory() as local_store:
downloader = FTPDownloader(port=ftpserver.server_port)
outfile = os.path.join(local_store, "tiny-data.txt")
downloader(url, outfile, None)
check_tiny_data(outfile)
@pytest.mark.network
@pytest.mark.skipif(paramiko is None, reason="requires paramiko to run SFTP")
def test_sftp_downloader():
"Test sftp downloader"
with TemporaryDirectory() as local_store:
downloader = SFTPDownloader(username="demo", password="password")
url = "sftp://test.rebex.net/pub/example/pocketftp.png"
outfile = os.path.join(local_store, "pocketftp.png")
downloader(url, outfile, None)
assert os.path.exists(outfile)
@pytest.mark.network
@pytest.mark.skipif(paramiko is None, reason="requires paramiko to run SFTP")
def test_sftp_downloader_fail_if_file_object():
"Downloader should fail when a file object rather than string is passed"
with TemporaryDirectory() as local_store:
downloader = SFTPDownloader(username="demo", password="password")
url = "sftp://test.rebex.net/pub/example/pocketftp.png"
outfile = os.path.join(local_store, "pocketftp.png")
with open(outfile, "wb") as outfile_obj:
with pytest.raises(TypeError):
downloader(url, outfile_obj, None)
@pytest.mark.skipif(paramiko is not None, reason="paramiko must be missing")
def test_sftp_downloader_fail_if_paramiko_missing():
"test must fail if paramiko is not installed"
with pytest.raises(ValueError) as exc:
SFTPDownloader()
assert "'paramiko'" in str(exc.value)
@pytest.mark.skipif(tqdm is not None, reason="tqdm must be missing")
@pytest.mark.parametrize("downloader", [HTTPDownloader, FTPDownloader, SFTPDownloader])
def test_downloader_progressbar_fails(downloader):
"Make sure an error is raised if trying to use progressbar without tqdm"
with pytest.raises(ValueError) as exc:
downloader(progressbar=True)
assert "'tqdm'" in str(exc.value)
@pytest.mark.network
@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
@pytest.mark.parametrize(
"url,downloader",
[
(BASEURL, HTTPDownloader),
pytest.param(FIGSHAREURL, DOIDownloader, marks=pytest.mark.figshare),
],
ids=["http", "figshare"],
)
def test_downloader_progressbar(url, downloader, capsys):
"Setup a downloader function that prints a progress bar for fetch"
download = downloader(progressbar=True)
with TemporaryDirectory() as local_store:
fname = "tiny-data.txt"
url = url + fname
outfile = os.path.join(local_store, fname)
download(url, outfile, None)
# Read stderr and make sure the progress bar is printed only when told
captured = capsys.readouterr()
printed = captured.err.split("\r")[-1].strip()
assert len(printed) == 79
if sys.platform == "win32":
progress = "100%|####################"
else:
progress = "100%|████████████████████"
# Bar size is not always the same so can't reliably test the whole bar.
assert printed[:25] == progress
# Check that the downloaded file has the right content
check_tiny_data(outfile)
@pytest.mark.network
@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
def test_downloader_progressbar_ftp(capsys, ftpserver):
"Setup an FTP downloader function that prints a progress bar for fetch"
with data_over_ftp(ftpserver, "tiny-data.txt") as url:
download = FTPDownloader(progressbar=True, port=ftpserver.server_port)
with TemporaryDirectory() as local_store:
outfile = os.path.join(local_store, "tiny-data.txt")
download(url, outfile, None)
# Read stderr and make sure the progress bar is printed only when
# told
captured = capsys.readouterr()
printed = captured.err.split("\r")[-1].strip()
assert len(printed) == 79
if sys.platform == "win32":
progress = "100%|####################"
else:
progress = "100%|████████████████████"
# Bar size is not always the same so can't reliably test the whole
# bar.
assert printed[:25] == progress
# Check that the file was actually downloaded
check_tiny_data(outfile)
@pytest.mark.network
@pytest.mark.skipif(tqdm is None, reason="requires tqdm")
@pytest.mark.skipif(paramiko is None, reason="requires paramiko")
def test_downloader_progressbar_sftp(capsys):
"Setup an SFTP downloader function that prints a progress bar for fetch"
downloader = SFTPDownloader(progressbar=True, username="demo", password="password")
with TemporaryDirectory() as local_store:
url = "sftp://test.rebex.net/pub/example/pocketftp.png"
outfile = os.path.join(local_store, "pocketftp.png")
downloader(url, outfile, None)
# Read stderr and make sure the progress bar is printed only when told
captured = capsys.readouterr()
printed = captured.err.split("\r")[-1].strip()
assert len(printed) == 79
if sys.platform == "win32":
progress = "100%|####################"
else:
progress = "100%|████████████████████"
# Bar size is not always the same so can't reliably test the whole bar.
assert printed[:25] == progress
# Check that the file was actually downloaded
assert os.path.exists(outfile)
@pytest.mark.network
def test_downloader_arbitrary_progressbar(capsys):
"Setup a downloader function with an arbitrary progress bar class."
class MinimalProgressDisplay:
"""A minimalist replacement for tqdm.tqdm"""
def __init__(self, total):
self.count = 0
self.total = total
def __repr__(self):
"""represent current completion"""
return str(self.count) + "/" + str(self.total)
def render(self):
"""print self.__repr__ to stderr"""
print(f"\r{self}", file=sys.stderr, end="")
def update(self, i):
"""modify completion and render"""
self.count = i
self.render()
def reset(self):
"""set counter to 0"""
self.count = 0
@staticmethod
def close():
"""print a new empty line"""
print("", file=sys.stderr)
pbar = MinimalProgressDisplay(total=None)
download = HTTPDownloader(progressbar=pbar)
with TemporaryDirectory() as local_store:
fname = "large-data.txt"
url = BASEURL + fname
outfile = os.path.join(local_store, "large-data.txt")
download(url, outfile, None)
# Read stderr and make sure the progress bar is printed only when told
captured = capsys.readouterr()
printed = captured.err.split("\r")[-1].strip()
progress = "336/336"
assert printed == progress
# Check that the downloaded file has the right content
check_large_data(outfile)
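# MinimalProgressDisplay above also documents the interface a custom
# progressbar needs to support when passed as HTTPDownloader(progressbar=...):
# a `total` attribute plus update(), reset(), and close() methods.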
class TestZenodoAPISupport:
"""
Test support for different Zenodo APIs
"""
article_id = 123456
doi = f"10.0001/zenodo.{article_id}"
doi_url = f"https://doi.org/{doi}"
file_name = "my-file.zip"
file_url = (
"https://zenodo.org/api/files/513d7033-93a2-4eeb-821c-2fb0bbab0012/my-file.zip"
)
file_checksum = "2942bfabb3d05332b66eb128e0842cff"
legacy_api_response = {
"created": "2021-20-19T08:00:00.000000+00:00",
"modified": "2021-20-19T08:00:00.000000+00:00",
"id": article_id,
"doi": doi,
"doi_url": doi_url,
"files": [
{
"id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
"key": file_name,
"checksum": f"md5:{file_checksum}",
"links": {
"self": file_url,
},
}
],
}
new_api_response = {
"created": "2021-20-19T08:00:00.000000+00:00",
"modified": "2021-20-19T08:00:00.000000+00:00",
"id": article_id,
"doi": doi,
"doi_url": doi_url,
"files": [
{
"id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
"filename": file_name,
"checksum": file_checksum,
"links": {
"self": file_url,
},
}
],
}
invalid_api_response = {
"created": "2021-20-19T08:00:00.000000+00:00",
"modified": "2021-20-19T08:00:00.000000+00:00",
"id": article_id,
"doi": doi,
"doi_url": doi_url,
"files": [
{
"id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
"filename": file_name,
"checksum": file_checksum,
"links": {
"self": file_url,
},
},
{
"id": "513d7033-93a2-4eeb-821c-2fb0bbab0012",
"key": file_name,
"checksum": f"md5:{file_checksum}",
"links": {
"self": file_url,
},
},
],
}
@pytest.mark.parametrize(
"api_version, api_response",
[
("legacy", legacy_api_response),
("new", new_api_response),
("invalid", invalid_api_response),
],
)
def test_api_version(self, httpserver, api_version, api_response):
"""
Test if the API version is correctly detected.
"""
# Create a local http server
httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json(
api_response
)
# Create Zenodo downloader
downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url)
# Override base url for the API of the downloader
downloader.base_api_url = httpserver.url_for("")
# Check if the API version is correctly identified
if api_version != "invalid":
assert downloader.api_version == api_version
else:
msg = "Couldn't determine the version of the Zenodo API"
with pytest.raises(ValueError, match=msg):
api_version = downloader.api_version
@pytest.mark.parametrize(
"api_version, api_response",
[("legacy", legacy_api_response), ("new", new_api_response)],
)
def test_download_url(self, httpserver, api_version, api_response):
"""
Test if the download url is correct for each API version.
"""
# Create a local http server
httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json(
api_response
)
# Create Zenodo downloader
downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url)
# Override base url for the API of the downloader
downloader.base_api_url = httpserver.url_for("")
# Check if the download url is correct
download_url = downloader.download_url(file_name=self.file_name)
if api_version == "legacy":
assert download_url == self.file_url
else:
expected_url = (
"https://zenodo.org/records/"
f"{self.article_id}/files/{self.file_name}?download=1"
)
assert download_url == expected_url
@pytest.mark.parametrize(
"api_response",
[legacy_api_response, new_api_response],
)
def test_populate_registry(self, httpserver, tmp_path, api_response):
"""
Test if population of registry is correctly done for each API version.
"""
# Create a local http server
httpserver.expect_request(f"/zenodo.{self.article_id}").respond_with_json(
api_response
)
# Create sample pooch object
puppy = Pooch(base_url="", path=tmp_path)
# Create Zenodo downloader
downloader = ZenodoRepository(doi=self.doi, archive_url=self.doi_url)
# Override base url for the API of the downloader
downloader.base_api_url = httpserver.url_for("")
# Populate registry
downloader.populate_registry(puppy)
assert puppy.registry == {self.file_name: f"md5:{self.file_checksum}"}
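# The fixtures above encode how the two Zenodo API flavors differ: the legacy
# API lists files under "key" with an "md5:"-prefixed checksum, while the new
# API uses "filename" with a bare checksum. A sketch of the version detection
# those fixtures suggest (an inference, not pooch's actual code):
def _sketch_zenodo_api_version(api_response):
    "Guess 'legacy' or 'new' from a response; mixed responses are an error."
    files = api_response["files"]
    if all("key" in entry for entry in files):
        return "legacy"
    if all("filename" in entry for entry in files):
        return "new"
    raise ValueError("Couldn't determine the version of the Zenodo API")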
class TestDOIDownloaderHeaders:
"""Test the headers argument in DOIDownloader."""
def test_default_headers(self):
"""Test the default value for headers."""
downloader = DOIDownloader()
assert downloader.headers == REQUESTS_HEADERS
downloader = DOIDownloader(headers=None)
assert downloader.headers == REQUESTS_HEADERS
def test_overwrite_headers(self):
"""Test overwriting for headers."""
downloader = DOIDownloader(headers={"custom": "field"})
expected_headers = {
"custom": "field",
}
assert downloader.headers == expected_headers
def test_headers_empty_dict(self):
"""Test passing an emtpy dict to headers."""
downloader = DOIDownloader(headers={})
assert downloader.headers == {}

View File

@@ -0,0 +1,204 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
# pylint: disable=redefined-outer-name
"""
Test the hash calculation and checking functions.
"""
import os
from pathlib import Path
from tempfile import NamedTemporaryFile
import pytest
try:
import xxhash
XXHASH_MAJOR_VERSION = int(xxhash.VERSION.split(".", maxsplit=1)[0])
except ImportError:
xxhash = None # type: ignore[assignment]
XXHASH_MAJOR_VERSION = 0
from ..core import Pooch
from ..hashes import (
make_registry,
file_hash,
hash_matches,
)
from .utils import check_tiny_data, mirror_directory
DATA_DIR = str(Path(__file__).parent / "data" / "store")
REGISTRY = (
"tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d\n"
)
REGISTRY_RECURSIVE = (
"subdir/tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d\n"
"tiny-data.txt baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d\n"
)
TINY_DATA_HASHES_HASHLIB = {
"sha1": "c03148994acd89317915ea2f2d080d6dd127aa09",
"sha256": "baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d",
"md5": "70e2afd3fd7e336ae478b1e740a5f08e",
}
TINY_DATA_HASHES_XXH = {
"xxh64": "f843815fe57948fa",
"xxh32": "98d6f1a2",
    # Require xxhash >= 2.0
"xxh128": "0267d220db258fffb0c567c0ecd1b689",
"xxh3_128": "0267d220db258fffb0c567c0ecd1b689",
"xxh3_64": "811e3f2a12aec53f",
}
TINY_DATA_HASHES = TINY_DATA_HASHES_HASHLIB.copy()
TINY_DATA_HASHES.update(TINY_DATA_HASHES_XXH)
@pytest.fixture
def data_dir_mirror(tmp_path):
"""
Mirror the test data folder on a temporary directory. Needed to avoid
permission errors when pooch is installed on a non-writable path.
"""
return mirror_directory(DATA_DIR, tmp_path)
def test_make_registry(data_dir_mirror):
"Check that the registry builder creates the right file names and hashes"
outfile = NamedTemporaryFile(delete=False) # pylint: disable=consider-using-with
    # Close the file first so make_registry can write to it (the open handle
    # can't be reused on Windows).
outfile.close()
try:
make_registry(data_dir_mirror, outfile.name, recursive=False)
        with open(outfile.name, encoding="utf-8") as fin:
            registry = fin.read()
assert registry == REGISTRY
# Check that the registry can be used.
pup = Pooch(path=data_dir_mirror, base_url="some bogus URL", registry={})
pup.load_registry(outfile.name)
true = str(data_dir_mirror / "tiny-data.txt")
fname = pup.fetch("tiny-data.txt")
assert true == fname
check_tiny_data(fname)
finally:
os.remove(outfile.name)
def test_make_registry_recursive(data_dir_mirror):
"Check that the registry builder works in recursive mode"
outfile = NamedTemporaryFile(delete=False) # pylint: disable=consider-using-with
    # Close the file first so make_registry can write to it (the open handle
    # can't be reused on Windows).
outfile.close()
try:
make_registry(data_dir_mirror, outfile.name, recursive=True)
        with open(outfile.name, encoding="utf-8") as fin:
            registry = fin.read()
assert registry == REGISTRY_RECURSIVE
# Check that the registry can be used.
pup = Pooch(path=data_dir_mirror, base_url="some bogus URL", registry={})
pup.load_registry(outfile.name)
assert str(data_dir_mirror / "tiny-data.txt") == pup.fetch("tiny-data.txt")
check_tiny_data(pup.fetch("tiny-data.txt"))
true = str(data_dir_mirror / "subdir" / "tiny-data.txt")
assert true == pup.fetch("subdir/tiny-data.txt")
check_tiny_data(pup.fetch("subdir/tiny-data.txt"))
finally:
os.remove(outfile.name)
def test_file_hash_invalid_algorithm():
"Test an invalid hashing algorithm"
with pytest.raises(ValueError) as exc:
file_hash(fname="something", alg="blah")
assert "'blah'" in str(exc.value)
@pytest.mark.parametrize(
"alg,expected_hash",
list(TINY_DATA_HASHES.items()),
ids=list(TINY_DATA_HASHES.keys()),
)
def test_file_hash(alg, expected_hash):
"Test the hash calculation using hashlib and xxhash"
if alg.startswith("xxh"):
if xxhash is None:
pytest.skip("requires xxhash")
if alg not in ["xxh64", "xxh32"] and XXHASH_MAJOR_VERSION < 2:
pytest.skip("requires xxhash > 2.0")
fname = os.path.join(DATA_DIR, "tiny-data.txt")
check_tiny_data(fname)
returned_hash = file_hash(fname, alg)
assert returned_hash == expected_hash
@pytest.mark.parametrize(
"alg,expected_hash",
list(TINY_DATA_HASHES.items()),
ids=list(TINY_DATA_HASHES.keys()),
)
def test_hash_matches(alg, expected_hash):
"Make sure the hash checking function works"
if alg.startswith("xxh"):
if xxhash is None:
pytest.skip("requires xxhash")
if alg not in ["xxh64", "xxh32"] and XXHASH_MAJOR_VERSION < 2:
pytest.skip("requires xxhash > 2.0")
fname = os.path.join(DATA_DIR, "tiny-data.txt")
check_tiny_data(fname)
# Check if the check passes
known_hash = f"{alg}:{expected_hash}"
assert hash_matches(fname, known_hash)
# And also if it fails
known_hash = f"{alg}:blablablabla"
assert not hash_matches(fname, known_hash)
@pytest.mark.parametrize(
"alg,expected_hash",
list(TINY_DATA_HASHES_HASHLIB.items()),
ids=list(TINY_DATA_HASHES_HASHLIB.keys()),
)
def test_hash_matches_strict(alg, expected_hash):
"Make sure the hash checking function raises an exception if strict"
fname = os.path.join(DATA_DIR, "tiny-data.txt")
check_tiny_data(fname)
# Check if the check passes
known_hash = f"{alg}:{expected_hash}"
assert hash_matches(fname, known_hash, strict=True)
# And also if it fails
bad_hash = f"{alg}:blablablabla"
with pytest.raises(ValueError) as error:
hash_matches(fname, bad_hash, strict=True, source="Neverland")
assert "Neverland" in str(error.value)
with pytest.raises(ValueError) as error:
hash_matches(fname, bad_hash, strict=True, source=None)
assert fname in str(error.value)
def test_hash_matches_none():
"The hash checking function should always returns True if known_hash=None"
fname = os.path.join(DATA_DIR, "tiny-data.txt")
assert hash_matches(fname, known_hash=None)
# Should work even if the file is invalid
assert hash_matches(fname="", known_hash=None)
# strict should cause an error if this wasn't working
assert hash_matches(fname, known_hash=None, strict=True)
@pytest.mark.parametrize(
"alg,expected_hash",
list(TINY_DATA_HASHES_HASHLIB.items()),
ids=list(TINY_DATA_HASHES_HASHLIB.keys()),
)
def test_hash_matches_uppercase(alg, expected_hash):
"Hash matching should be independent of upper or lower case"
fname = os.path.join(DATA_DIR, "tiny-data.txt")
check_tiny_data(fname)
# Check if the check passes
known_hash = f"{alg}:{expected_hash.upper()}"
assert hash_matches(fname, known_hash, strict=True)
# And also if it fails
with pytest.raises(ValueError) as error:
hash_matches(fname, known_hash[:-5], strict=True, source="Neverland")
assert "Neverland" in str(error.value)

View File

@@ -0,0 +1,49 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
# pylint: disable=redefined-outer-name
"""
Test the entire process of creating a Pooch and using it.
"""
import os
import shutil
from pathlib import Path
import pytest
from .. import create, os_cache
from .. import __version__ as full_version
from .utils import check_tiny_data, capture_log
@pytest.mark.network
def test_create_and_fetch():
"Fetch a data file from the local storage"
path = os_cache("pooch-testing")
if path.exists():
shutil.rmtree(str(path))
pup = create(
path=path,
base_url="https://github.com/fatiando/pooch/raw/{version}/data/",
version=full_version,
version_dev="main",
env="POOCH_DATA_DIR",
)
# Make sure the storage isn't created until a download is required
assert not pup.abspath.exists()
pup.load_registry(Path(os.path.dirname(__file__), "data", "registry.txt"))
for target in ["tiny-data.txt", "subdir/tiny-data.txt"]:
with capture_log() as log_file:
fname = pup.fetch(target)
assert log_file.getvalue().split()[0] == "Downloading"
check_tiny_data(fname)
# Now modify the file to trigger an update on the next fetch
with open(fname, "w", encoding="utf-8") as fin:
fin.write("The data is now different")
with capture_log() as log_file:
fname = pup.fetch(target)
assert log_file.getvalue().split()[0] == "Updating"
check_tiny_data(fname)

View File

@@ -0,0 +1,289 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Test the processor hooks
"""
from pathlib import Path
from tempfile import TemporaryDirectory
import warnings
import pytest
from .. import Pooch
from ..processors import Unzip, Untar, Decompress
from .utils import pooch_test_url, pooch_test_registry, check_tiny_data, capture_log
REGISTRY = pooch_test_registry()
BASEURL = pooch_test_url()
@pytest.mark.network
@pytest.mark.parametrize(
"method,ext,name",
[
("auto", "xz", None),
("lzma", "xz", None),
("xz", "xz", None),
("bzip2", "bz2", None),
("gzip", "gz", None),
("gzip", "gz", "different-name.txt"),
],
ids=["auto", "lzma", "xz", "bz2", "gz", "name"],
)
def test_decompress(method, ext, name):
"Check that decompression after download works for all formats"
processor = Decompress(method=method, name=name)
with TemporaryDirectory() as local_store:
path = Path(local_store)
if name is None:
true_path = str(path / ".".join(["tiny-data.txt", ext, "decomp"]))
else:
true_path = str(path / name)
# Setup a pooch in a temp dir
pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)
# Check the logs when downloading and from the processor
with capture_log() as log_file:
fname = pup.fetch("tiny-data.txt." + ext, processor=processor)
logs = log_file.getvalue()
lines = logs.splitlines()
assert len(lines) == 2
assert lines[0].split()[0] == "Downloading"
assert lines[-1].startswith("Decompressing")
assert method in lines[-1]
assert fname == true_path
check_tiny_data(fname)
# Check that processor doesn't execute when not downloading
with capture_log() as log_file:
fname = pup.fetch("tiny-data.txt." + ext, processor=processor)
assert log_file.getvalue() == ""
assert fname == true_path
check_tiny_data(fname)
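# Note the naming convention encoded in true_path above: by default the
# decompressed output sits next to the download with a ".decomp" suffix
# appended ("tiny-data.txt.gz" -> "tiny-data.txt.gz.decomp"), while name=...
# writes to that file name in the same folder instead.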
@pytest.mark.network
def test_decompress_fails():
"Should fail if method='auto' and no extension is given in the file name"
with TemporaryDirectory() as local_store:
path = Path(local_store)
pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)
# Invalid extension
with pytest.raises(ValueError) as exception:
with warnings.catch_warnings():
pup.fetch("tiny-data.txt", processor=Decompress(method="auto"))
assert exception.value.args[0].startswith("Unrecognized file extension '.txt'")
assert "pooch.Unzip/Untar" not in exception.value.args[0]
# Should also fail for a bad method name
with pytest.raises(ValueError) as exception:
with warnings.catch_warnings():
pup.fetch("tiny-data.txt", processor=Decompress(method="bla"))
assert exception.value.args[0].startswith("Invalid compression method 'bla'")
assert "pooch.Unzip/Untar" not in exception.value.args[0]
# Point people to Untar and Unzip
with pytest.raises(ValueError) as exception:
with warnings.catch_warnings():
pup.fetch("tiny-data.txt", processor=Decompress(method="zip"))
assert exception.value.args[0].startswith("Invalid compression method 'zip'")
assert "pooch.Unzip/Untar" in exception.value.args[0]
with pytest.raises(ValueError) as exception:
with warnings.catch_warnings():
pup.fetch("store.zip", processor=Decompress(method="auto"))
assert exception.value.args[0].startswith("Unrecognized file extension '.zip'")
assert "pooch.Unzip/Untar" in exception.value.args[0]
@pytest.mark.network
@pytest.mark.parametrize(
"target_path", [None, "some_custom_path"], ids=["default_path", "custom_path"]
)
@pytest.mark.parametrize(
"archive,members",
[
("tiny-data", ["tiny-data.txt"]),
("store", None),
("store", ["store/tiny-data.txt"]),
("store", ["store/subdir/tiny-data.txt"]),
("store", ["store/subdir"]),
("store", ["store/tiny-data.txt", "store/subdir"]),
],
ids=[
"single_file",
"archive_all",
"archive_file",
"archive_subdir_file",
"archive_subdir",
"archive_multiple",
],
)
@pytest.mark.parametrize(
"processor_class,extension",
[(Unzip, ".zip"), (Untar, ".tar.gz")],
ids=["Unzip", "Untar"],
)
def test_unpacking(processor_class, extension, target_path, archive, members):
"Tests the behaviour of processors for unpacking archives (Untar, Unzip)"
processor = processor_class(members=members, extract_dir=target_path)
if target_path is None:
target_path = archive + extension + processor.suffix
with TemporaryDirectory() as path:
path = Path(path)
true_paths, expected_log = _unpacking_expected_paths_and_logs(
archive, members, path / target_path, processor_class.__name__
)
# Setup a pooch in a temp dir
pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)
# Capture logs and check for the right processor message
with capture_log() as log_file:
fnames = pup.fetch(archive + extension, processor=processor)
assert set(fnames) == true_paths
_check_logs(log_file, expected_log)
for fname in fnames:
check_tiny_data(fname)
# Check that processor doesn't execute when not downloading
with capture_log() as log_file:
fnames = pup.fetch(archive + extension, processor=processor)
assert set(fnames) == true_paths
_check_logs(log_file, [])
for fname in fnames:
check_tiny_data(fname)
@pytest.mark.network
@pytest.mark.parametrize(
"processor_class,extension",
[(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_multiple_unpacking(processor_class, extension):
"Test that multiple subsequent calls to a processor yield correct results"
with TemporaryDirectory() as local_store:
pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)
# Do a first fetch with the one member only
processor1 = processor_class(members=["store/tiny-data.txt"])
filenames1 = pup.fetch("store" + extension, processor=processor1)
assert len(filenames1) == 1
check_tiny_data(filenames1[0])
# Do a second fetch with the other member
processor2 = processor_class(
members=["store/tiny-data.txt", "store/subdir/tiny-data.txt"]
)
filenames2 = pup.fetch("store" + extension, processor=processor2)
assert len(filenames2) == 2
check_tiny_data(filenames2[0])
check_tiny_data(filenames2[1])
# Do a third fetch, again with one member and assert
# that only this member was returned
filenames3 = pup.fetch("store" + extension, processor=processor1)
assert len(filenames3) == 1
check_tiny_data(filenames3[0])
@pytest.mark.network
@pytest.mark.parametrize(
"processor_class,extension",
[(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_unpack_members_with_leading_dot(processor_class, extension):
"Test that unpack members can also be specifed both with a leading ./"
with TemporaryDirectory() as local_store:
pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)
# Do a first fetch with the one member only
processor1 = processor_class(members=["./store/tiny-data.txt"])
filenames1 = pup.fetch("store" + extension, processor=processor1)
assert len(filenames1) == 1
check_tiny_data(filenames1[0])
def _check_logs(log_file, expected_lines):
"""
Assert that the lines in the log match the expected ones.
"""
lines = log_file.getvalue().splitlines()
assert len(lines) == len(expected_lines)
for line, expected_line in zip(lines, expected_lines):
assert line.startswith(expected_line)
def _unpacking_expected_paths_and_logs(archive, members, path, name):
"""
Generate the appropriate expected paths and log message depending on the
parameters for the test.
"""
log_lines = ["Downloading"]
if archive == "tiny-data":
true_paths = {str(path / "tiny-data.txt")}
log_lines.append("Extracting 'tiny-data.txt'")
elif archive == "store" and members is None:
true_paths = {
str(path / "store" / "tiny-data.txt"),
str(path / "store" / "subdir" / "tiny-data.txt"),
}
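        # Doubling the final letter turns the processor name into a gerund:
        # "Unzip" -> "Unzipping", "Untar" -> "Untarring".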
log_lines.append(f"{name}{name[-1]}ing contents")
elif archive == "store" and members is not None:
true_paths = []
for member in members:
true_path = path / Path(*member.split("/"))
if not str(true_path).endswith("tiny-data.txt"):
true_path = true_path / "tiny-data.txt"
true_paths.append(str(true_path))
log_lines.append(f"Extracting '{member}'")
true_paths = set(true_paths)
return true_paths, log_lines
@pytest.mark.network
@pytest.mark.parametrize(
"processor_class,extension",
[(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_unpacking_members_then_no_members(processor_class, extension):
"""
Test that calling with valid members then without them works.
https://github.com/fatiando/pooch/issues/364
"""
with TemporaryDirectory() as local_store:
pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)
# Do a first fetch with an existing member
processor1 = processor_class(members=["store/tiny-data.txt"])
filenames1 = pup.fetch("store" + extension, processor=processor1)
assert len(filenames1) == 1
# Do a second fetch with no members
processor2 = processor_class()
filenames2 = pup.fetch("store" + extension, processor=processor2)
assert len(filenames2) > 1
@pytest.mark.network
@pytest.mark.parametrize(
"processor_class,extension",
[(Unzip, ".zip"), (Untar, ".tar.gz")],
)
def test_unpacking_wrong_members_then_no_members(processor_class, extension):
"""
Test that calling with invalid members then without them works.
https://github.com/fatiando/pooch/issues/364
"""
with TemporaryDirectory() as local_store:
pup = Pooch(path=Path(local_store), base_url=BASEURL, registry=REGISTRY)
# Do a first fetch with incorrect member
processor1 = processor_class(members=["not-a-valid-file.csv"])
filenames1 = pup.fetch("store" + extension, processor=processor1)
assert len(filenames1) == 0
# Do a second fetch with no members
processor2 = processor_class()
filenames2 = pup.fetch("store" + extension, processor=processor2)
assert len(filenames2) > 0

View File

@@ -0,0 +1,197 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Test the utility functions.
"""
import os
import shutil
import time
from pathlib import Path
import tempfile
from tempfile import TemporaryDirectory
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import pytest
from ..utils import (
parse_url,
make_local_storage,
temporary_file,
unique_file_name,
)
def test_unique_name_long():
"The file name should never be longer than 255 characters"
url = f"https://www.something.com/data{'a' * 500}.txt"
assert len(url) > 255
fname = unique_file_name(url)
assert len(fname) == 255
assert fname[-10:] == "aaaaaa.txt"
assert fname.split("-")[1][:10] == "aaaaaaaaaa"
@pytest.mark.parametrize(
"pool",
[ThreadPoolExecutor, ProcessPoolExecutor],
ids=["threads", "processes"],
)
def test_make_local_storage_parallel(pool, monkeypatch):
"Try to create the cache folder in parallel"
# Can cause multiple attempts at creating the folder which leads to an
# exception. Check that this doesn't happen.
# See https://github.com/fatiando/pooch/issues/170
# Monkey path makedirs to make it delay before creating the directory.
# Otherwise, the dispatch is too fast and the directory will exist before
# another process tries to create it.
# Need to keep a reference to the original function to avoid infinite
# recursions from the monkey patching.
makedirs = os.makedirs
def mockmakedirs(path, exist_ok=False): # pylint: disable=unused-argument
"Delay before calling makedirs"
time.sleep(1.5)
makedirs(path, exist_ok=exist_ok)
monkeypatch.setattr(os, "makedirs", mockmakedirs)
data_cache = os.path.join(os.curdir, "test_parallel_cache")
assert not os.path.exists(data_cache)
try:
with pool() as executor:
futures = [
executor.submit(make_local_storage, data_cache) for i in range(4)
]
for future in futures:
future.result()
assert os.path.exists(data_cache)
finally:
if os.path.exists(data_cache):
shutil.rmtree(data_cache)
def test_local_storage_makedirs_permissionerror(monkeypatch):
"Should warn the user when can't create the local data dir"
def mockmakedirs(path, exist_ok=False): # pylint: disable=unused-argument
"Raise an exception to mimic permission issues"
raise PermissionError("Fake error")
data_cache = os.path.join(os.curdir, "test_permission")
assert not os.path.exists(data_cache)
monkeypatch.setattr(os, "makedirs", mockmakedirs)
with pytest.raises(PermissionError) as error:
make_local_storage(
path=data_cache,
env="SOME_VARIABLE",
)
assert "Pooch could not create data cache" in str(error)
assert "'SOME_VARIABLE'" in str(error)
def test_local_storage_newfile_permissionerror(monkeypatch):
"Should warn the user when can't write to the local data dir"
# This is a separate function because there should be a warning if the data
# dir already exists but we can't write to it.
def mocktempfile(**kwargs): # pylint: disable=unused-argument
"Raise an exception to mimic permission issues"
raise PermissionError("Fake error")
with TemporaryDirectory() as data_cache:
os.makedirs(os.path.join(data_cache, "1.0"))
assert os.path.exists(data_cache)
monkeypatch.setattr(tempfile, "NamedTemporaryFile", mocktempfile)
with pytest.raises(PermissionError) as error:
make_local_storage(
path=data_cache,
env="SOME_VARIABLE",
)
assert "Pooch could not write to data cache" in str(error)
assert "'SOME_VARIABLE'" in str(error)
@pytest.mark.parametrize(
"url,output",
[
(
"http://127.0.0.1:8080/test.nc",
{"protocol": "http", "netloc": "127.0.0.1:8080", "path": "/test.nc"},
),
(
"ftp://127.0.0.1:8080/test.nc",
{"protocol": "ftp", "netloc": "127.0.0.1:8080", "path": "/test.nc"},
),
(
"doi:10.6084/m9.figshare.923450.v1/dike.json",
{
"protocol": "doi",
"netloc": "10.6084/m9.figshare.923450.v1",
"path": "/dike.json",
},
),
(
r"doi:10.5281/zenodo.7632643/santisoler/pooch-test-data-v1.zip",
{
"protocol": "doi",
"netloc": "10.5281/zenodo.7632643",
"path": "/santisoler/pooch-test-data-v1.zip",
},
),
],
ids=["http", "ftp", "doi", "zenodo-doi-with-slash"],
)
def test_parse_url(url, output):
"Parse URL into 3 components"
assert parse_url(url) == output
def test_parse_url_invalid_doi():
"Should fail if we forget to not include // in the DOI link"
with pytest.raises(ValueError):
parse_url("doi://XXX/XXX/fname.txt")
def test_temporary_file():
"Make sure the file is writable and cleaned up in the end"
with temporary_file() as tmp:
assert Path(tmp).exists()
with open(tmp, "w", encoding="utf-8") as outfile:
outfile.write("Meh")
with open(tmp, encoding="utf-8") as infile:
assert infile.read().strip() == "Meh"
assert not Path(tmp).exists()
def test_temporary_file_path():
"Make sure the file is writable and cleaned up in the end when given a dir"
with TemporaryDirectory() as path:
with temporary_file(path) as tmp:
assert Path(tmp).exists()
assert path in tmp
with open(tmp, "w", encoding="utf-8") as outfile:
outfile.write("Meh")
with open(tmp, encoding="utf-8") as infile:
assert infile.read().strip() == "Meh"
assert not Path(tmp).exists()
def test_temporary_file_exception():
"Make sure the file is writable and cleaned up when there is an exception"
try:
with temporary_file() as tmp:
assert Path(tmp).exists()
raise ValueError("Nooooooooo!")
except ValueError:
assert not Path(tmp).exists()

View File

@@ -0,0 +1,19 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Test the version.
"""
from packaging.version import Version
import pooch
def test_version():
"Check there's a usable version number in the usual __version__"
assert pooch.__version__.startswith("v")
# Check that it's PEP440 compliant (will raise an exception otherwise)
Version(pooch.__version__)

View File

@@ -0,0 +1,237 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Utilities for testing code.
"""
import os
import io
import logging
import shutil
import stat
from pathlib import Path
from contextlib import contextmanager
from .. import __version__ as full_version
from ..utils import check_version, get_logger
def check_tiny_data(fname):
"""
Load the tiny-data.txt file and check that the contents are correct.
"""
assert os.path.exists(fname)
with open(fname, encoding="utf-8") as tinydata:
content = tinydata.read()
true_content = "\n".join(
["# A tiny data file for test purposes only", "1 2 3 4 5 6"]
)
assert content.strip() == true_content
def check_large_data(fname):
"""
Load the large-data.txt file and check that the contents are correct.
"""
assert os.path.exists(fname)
with open(fname, encoding="utf-8") as data:
content = data.read()
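    # Note: "larer" [sic] is kept as-is below; it presumably reproduces the
    # header line of large-data.txt itself (whose hash is pinned in
    # pooch_test_registry), so "fixing" the spelling would break this check.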
true_content = ["# A larer data file for test purposes only"]
true_content.extend(["1 2 3 4 5 6"] * 6002)
assert content.strip() == "\n".join(true_content)
def pooch_test_url():
"""
Get the base URL for the test data used in Pooch itself.
The URL is a GitHub raw link to the ``pooch/tests/data`` directory from the
    `GitHub repository <https://github.com/fatiando/pooch>`__. It matches the
    version of this installation of pooch (``pooch.__version__``).
Returns
-------
url
The versioned URL for pooch's test data.
"""
version = check_version(full_version, fallback="main")
url = f"https://github.com/fatiando/pooch/raw/{version}/pooch/tests/data/"
return url
def pooch_test_figshare_url():
"""
Get the base URL for the test data stored in figshare.
The URL contains the DOI for the figshare dataset using the appropriate
version for this version of Pooch.
Returns
-------
url
The URL for pooch's test data.
"""
url = "doi:10.6084/m9.figshare.14763051.v1/"
return url
def pooch_test_zenodo_url():
"""
Get the base URL for the test data stored in Zenodo.
The URL contains the DOI for the Zenodo dataset using the appropriate
version for this version of Pooch.
Returns
-------
url
The URL for pooch's test data.
"""
url = "doi:10.5281/zenodo.4924875/"
return url
def pooch_test_zenodo_with_slash_url():
"""
    Get the base URL for test data in Zenodo where the file name contains a slash
The URL contains the DOI for the Zenodo dataset that has a slash in the
filename (created with the GitHub-Zenodo integration service), using the
appropriate version for this version of Pooch.
Returns
-------
url
The URL for pooch's test data.
"""
url = "doi:10.5281/zenodo.7632643/"
return url
def pooch_test_dataverse_url():
"""
Get the base URL for the test data stored on a DataVerse instance.
Returns
-------
url
The URL for pooch's test data.
"""
url = "doi:10.11588/data/TKCFEF/"
return url
def pooch_test_registry():
"""
Get a registry for the test data used in Pooch itself.
Returns
-------
registry
Dictionary with pooch's test data files and their hashes.
"""
registry = {
"tiny-data.txt": "baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d",
"large-data.txt": "98de171fb320da82982e6bf0f3994189fff4b42b23328769afce12bdd340444a",
"subdir/tiny-data.txt": "baee0894dba14b12085eacb204284b97e362f4f3e5a5807693cc90ef415c1b2d",
"tiny-data.zip": "0d49e94f07bc1866ec57e7fd1b93a351fba36842ec9b13dd50bf94e8dfa35cbb",
"store.zip": "0498d2a001e71051bbd2acd2346f38da7cbd345a633cb7bf0f8a20938714b51a",
"tiny-data.tar.gz": "41503f083814f43a01a8e9a30c28d7a9fe96839a99727a7fdd0acf7cd5bab63b",
"store.tar.gz": "088c7f4e0f1859b1c769bb6065de24376f366374817ede8691a6ac2e49f29511",
"tiny-data.txt.bz2": "753663687a4040c90c8578061867d1df623e6aa8011c870a5dbd88ee3c82e306",
"tiny-data.txt.gz": "2e2da6161291657617c32192dba95635706af80c6e7335750812907b58fd4b52",
"tiny-data.txt.xz": "99dcb5c32a6e916344bacb4badcbc2f2b6ee196977d1d8187610c21e7e607765",
}
return registry
@contextmanager
def capture_log(level=logging.DEBUG):
"""
Create a context manager for reading from the logs.
Yields
------
log_file : StringIO
a file-like object to which the logs were written
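    Examples
    --------
    A minimal sketch of the intended usage (any message logged through
    Pooch's logger while the context is active ends up in ``log_file``):

    >>> with capture_log() as log_file:
    ...     get_logger().warning("Something noteworthy happened")
    ...     print(log_file.getvalue().strip())
    Something noteworthy happened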
"""
log_file = io.StringIO()
handler = logging.StreamHandler(log_file)
handler.setLevel(level)
get_logger().addHandler(handler)
    try:
        yield log_file
    finally:
        # Remove the handler even if the calling code raises an exception
        get_logger().removeHandler(handler)
@contextmanager
def data_over_ftp(server, fname):
"""
Add a test data file to the test FTP server and clean it up afterwards.
Parameters
----------
server
The ``ftpserver`` fixture provided by pytest-localftpserver.
fname : str
The name of a file *relative* to the test data folder of the package
(usually just the file name, not the full path).
Yields
------
url : str
The download URL of the data file from the test FTP server.
"""
package_path = str(Path(__file__).parent / "data" / fname)
server_path = os.path.join(server.anon_root, fname)
try:
shutil.copyfile(package_path, server_path)
url = f"ftp://localhost/{fname}"
yield url
finally:
if os.path.exists(server_path):
os.remove(server_path)
def _recursive_chmod_directories(root, mode):
"""
Recursively change the permissions on the child directories using a bitwise
OR operation.
"""
for item in root.iterdir():
if item.is_dir():
item.chmod(item.stat().st_mode | mode)
_recursive_chmod_directories(item, mode)
def mirror_directory(source, destination):
"""
Copy contents of the source directory into destination and fix permissions.
Parameters
----------
source : str, :class:`pathlib.Path`
Source data directory.
destination : str, :class:`pathlib.Path`
        Destination directory that will contain the copy of source. The actual
        source directory (not just its contents) is copied.
Returns
-------
mirror : :class:`pathlib.Path`
The path of the mirrored output directory.
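    Examples
    --------
    A small sketch with throwaway directories (the file name and contents
    below are arbitrary):

    >>> import tempfile
    >>> from pathlib import Path
    >>> with tempfile.TemporaryDirectory() as src, \
    ...         tempfile.TemporaryDirectory() as dst:
    ...     _ = (Path(src) / "data.txt").write_text("1 2 3", encoding="utf-8")
    ...     mirror = mirror_directory(src, dst)
    ...     print([item.name for item in mirror.iterdir()])
    ['data.txt']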
"""
source = Path(source)
mirror = Path(destination) / source.name
shutil.copytree(source, mirror)
_recursive_chmod_directories(mirror, mode=stat.S_IWUSR)
return mirror

View File

@@ -0,0 +1,72 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Custom classes for type annotations
This module provides additional `PEP 484 <https://peps.python.org/pep-0484/>`_
type aliases used in ``pooch``'s codebase.
"""
import os
from typing import (
TYPE_CHECKING,
Any,
Callable,
Literal,
Optional,
Protocol,
TypedDict,
Union,
)
# Import Pooch only if TYPE_CHECKING is true to avoid circular imports at runtime
if TYPE_CHECKING:
from .. import Pooch
__all__ = [
"Action",
"Downloader",
"PathType",
"PathInputType",
"ParsedURL",
"Processor",
]
Action = Literal["download", "fetch", "update"]
PathType = Union[str, os.PathLike]
PathInputType = Union[PathType, list[PathType], tuple[PathType, ...]]
Processor = Callable[[str, Action, Optional["Pooch"]], Any]
class Downloader(Protocol):
"""
    Protocol that defines the type of a downloader function.
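    Examples
    --------
    Any callable with a compatible signature satisfies this protocol. A
    minimal sketch of a conforming function (the name and body are
    illustrative only, not part of Pooch's API):

    >>> def my_downloader(fname, action, pooch, *, check_only=None):
    ...     pass  # a real downloader would fetch the file here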
"""
# pylint: disable=too-few-public-methods
def __call__( # noqa: E704
self,
fname: str,
action: Optional[PathType],
pooch: Optional["Pooch"],
*,
check_only: Optional[bool] = None,
) -> Any: ...
class ParsedURL(TypedDict):
"""
Type for a dictionary generated after parsing a URL.
The dictionary contains three keys: protocol, netloc and path.
"""
protocol: str
netloc: str
path: str

View File

@@ -0,0 +1,356 @@
# Copyright (c) 2018 The Pooch Developers.
# Distributed under the terms of the BSD 3-Clause License.
# SPDX-License-Identifier: BSD-3-Clause
#
# This code is part of the Fatiando a Terra project (https://www.fatiando.org)
#
"""
Misc utilities
"""
import logging
import os
import tempfile
import hashlib
from pathlib import Path
from urllib.parse import urlsplit
from contextlib import contextmanager
import warnings
from typing import Optional, Any, Generator
import platformdirs
from packaging.version import Version
from .typing import ParsedURL, PathType, PathInputType
LOGGER = logging.Logger("pooch")
LOGGER.addHandler(logging.StreamHandler())
def file_hash(*args, **kwargs) -> Any:
"""
WARNING: Importing this function from pooch.utils is DEPRECATED.
Please import from the top-level namespace (`from pooch import file_hash`)
instead, which is fully backwards compatible with pooch >= 0.1.
Examples
--------
>>> fname = "test-file-for-hash.txt"
>>> with open(fname, "w") as f:
... __ = f.write("content of the file")
>>> print(file_hash(fname))
0fc74468e6a9a829f103d069aeb2bb4f8646bad58bf146bb0e3379b759ec4a00
>>> import os
>>> os.remove(fname)
"""
# pylint: disable=import-outside-toplevel
from .hashes import file_hash as new_file_hash
message = """
Importing file_hash from pooch.utils is DEPRECATED. Please import from the
top-level namespace (`from pooch import file_hash`) instead, which is fully
backwards compatible with pooch >= 0.1.
"""
warnings.warn(message, DeprecationWarning, stacklevel=2)
return new_file_hash(*args, **kwargs)
def get_logger() -> logging.Logger:
r"""
Get the default event logger.
The logger records events like downloading files, unzipping archives, etc.
Use the method :meth:`logging.Logger.setLevel` of this object to adjust the
    verbosity level of Pooch.
Returns
-------
logger : :class:`logging.Logger`
The logger object for Pooch
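    Examples
    --------
    Grab the logger and, for instance, raise its level to silence download
    messages (the ``setLevel`` call is shown as a comment so this example
    does not change global state):

    >>> logger = get_logger()
    >>> logger.name
    'pooch'
    >>> # logger.setLevel(logging.WARNING)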
"""
return LOGGER
def os_cache(project: str) -> Path:
r"""
Default cache location based on the operating system.
The folder locations are defined by the ``platformdirs`` package
using the ``user_cache_dir`` function.
    Usually, the locations will be the following (see the
`platformdirs documentation <https://platformdirs.readthedocs.io>`__):
* Mac: ``~/Library/Caches/<AppName>``
* Unix: ``~/.cache/<AppName>`` or the value of the ``XDG_CACHE_HOME``
environment variable, if defined.
* Windows: ``C:\Users\<user>\AppData\Local\<AppAuthor>\<AppName>\Cache``
Parameters
----------
project : str
The project name.
Returns
-------
cache_path : :class:`pathlib.Path`
The default location for the data cache. User directories (``'~'``) are
not expanded.
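    Examples
    --------
    The exact location depends on the operating system, but it always
    contains the project name ("myproject" below is just a placeholder):

    >>> "myproject" in str(os_cache("myproject"))
    True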
"""
return Path(platformdirs.user_cache_dir(project))
def check_version(version: str, fallback: str = "master") -> str:
"""
Check if a version is PEP440 compliant and there are no unreleased changes.
For example, ``version = "0.1"`` will be returned as is but ``version =
"0.1+10.8dl8dh9"`` will return the fallback. This is the convention used by
`versioneer <https://github.com/warner/python-versioneer>`__ to mark that
this version is 10 commits ahead of the last release.
Parameters
----------
version : str
A version string.
fallback : str
What to return if the version string has unreleased changes.
Returns
-------
version : str
        If *version* is PEP440 compliant and there are no unreleased changes,
        then return *version*. Otherwise, return *fallback*.
Raises
------
InvalidVersion
If *version* is not PEP440 compliant.
Examples
--------
>>> check_version("0.1")
'0.1'
>>> check_version("0.1a10")
'0.1a10'
>>> check_version("0.1+111.9hdg36")
'master'
>>> check_version("0.1+111.9hdg36", fallback="dev")
'dev'
"""
parse = Version(version)
if parse.local is not None:
return fallback
return version
def parse_url(url: str) -> ParsedURL:
"""
Parse a URL into 3 components:
<protocol>://<netloc>/<path>
Example URLs:
* http://127.0.0.1:8080/test.nc
* ftp://127.0.0.1:8080/test.nc
* doi:10.6084/m9.figshare.923450.v1/test.nc
The DOI is a special case. The protocol will be "doi", the netloc will be
the DOI, and the path is what comes after the last "/".
The only exception are Zenodo dois: the protocol will be "doi", the netloc
will be composed by the "prefix/suffix" and the path is what comes after
the second "/". This allows to support special cases of Zenodo dois where
the path contains forward slashes "/", created by the GitHub-Zenodo
integration service.
Parameters
----------
url : str
The URL.
Returns
-------
parsed_url : dict
Three components of a URL (e.g.,
``{'protocol':'http', 'netloc':'127.0.0.1:8080','path': '/test.nc'}``).
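    Examples
    --------
    These mirror the cases exercised in Pooch's own tests (a plain HTTP URL
    and a figshare DOI):

    >>> parse_url("http://127.0.0.1:8080/test.nc")
    {'protocol': 'http', 'netloc': '127.0.0.1:8080', 'path': '/test.nc'}
    >>> parsed = parse_url("doi:10.6084/m9.figshare.923450.v1/dike.json")
    >>> parsed["protocol"], parsed["netloc"], parsed["path"]
    ('doi', '10.6084/m9.figshare.923450.v1', '/dike.json')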
"""
if url.startswith("doi://"):
raise ValueError(
f"Invalid DOI link '{url}'. You must not use '//' after 'doi:'."
)
if url.startswith("doi:"):
protocol = "doi"
parts = url[4:].split("/")
if "zenodo" in parts[1].lower():
netloc = "/".join(parts[:2])
path = "/" + "/".join(parts[2:])
else:
netloc = "/".join(parts[:-1])
path = "/" + parts[-1]
else:
parsed_url = urlsplit(url)
protocol = parsed_url.scheme or "file"
netloc = parsed_url.netloc
path = parsed_url.path
return {"protocol": protocol, "netloc": netloc, "path": path}
def cache_location(
path: PathInputType, env: Optional[str] = None, version: Optional[str] = None
) -> Path:
"""
Location of the cache given a base path and optional configuration.
    Checks for an environment variable that can override the path of the
    local cache. Optionally add *version* to the path if given.
Parameters
----------
path : str, PathLike, list or tuple
The path to the local data storage folder. If this is a list or tuple,
we'll join the parts with the appropriate separator. Use
:func:`pooch.os_cache` for a sensible default.
version : str or None
The version string for your project. Will be appended to given path if
not None.
env : str or None
        An environment variable that can be used to override *path*. This
        allows users to control where they want the data to be stored. We'll
        append *version* to the end of this value as well.
Returns
-------
local_path : PathLike
The path to the local directory.
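    Examples
    --------
    A minimal sketch (``MYPROJECT_DATA_DIR`` is just an illustrative variable
    name, not one that Pooch reads by default):

    >>> print(cache_location(("base", "folder"), version="v1").as_posix())
    base/folder/v1
    >>> import os
    >>> os.environ["MYPROJECT_DATA_DIR"] = "/tmp/mydata"
    >>> print(cache_location("ignored", env="MYPROJECT_DATA_DIR").as_posix())
    /tmp/mydata
    >>> del os.environ["MYPROJECT_DATA_DIR"]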
"""
if env is not None and env in os.environ and os.environ[env]:
path = os.environ[env]
if isinstance(path, (list, tuple)):
path = os.path.join(*path)
if version is not None:
path = os.path.join(str(path), version)
path = os.path.expanduser(str(path))
return Path(path)
def make_local_storage(path: PathType, env: Optional[str] = None) -> None:
"""
Create the local cache directory and make sure it's writable.
Parameters
----------
path : str or PathLike
The path to the local data storage folder.
env : str or None
        An environment variable that can be used to override *path*. Only used
        in the error message in case the folder is not writable.
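    Examples
    --------
    A throwaway sketch (the folder name is arbitrary):

    >>> import os
    >>> import tempfile
    >>> with tempfile.TemporaryDirectory() as base:
    ...     cache = os.path.join(base, "cache")
    ...     make_local_storage(cache)
    ...     os.path.isdir(cache)
    True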
"""
path = str(path)
# Check that the data directory is writable
if not os.path.exists(path):
action = "create"
else:
action = "write to"
try:
if action == "create":
# When running in parallel, it's possible that multiple jobs will
# try to create the path at the same time. Use exist_ok to avoid
# raising an error.
os.makedirs(path, exist_ok=True)
else:
with tempfile.NamedTemporaryFile(dir=path):
pass
except PermissionError as error:
message = [
str(error),
f"| Pooch could not {action} data cache folder '{path}'.",
"Will not be able to download data files.",
]
if env is not None:
message.append(
f"Use environment variable '{env}' to specify a different location."
)
raise PermissionError(" ".join(message)) from error
@contextmanager
def temporary_file(path: Optional[PathType] = None) -> Generator[str, None, None]:
"""
Create a closed and named temporary file and make sure it's cleaned up.
Using :class:`tempfile.NamedTemporaryFile` will fail on Windows if trying
    to open the file a second time (when passing its name to a Pooch function,
for example). This context manager creates the file, closes it, yields the
file path, and makes sure it's deleted in the end.
Parameters
----------
path : str or PathLike
The directory in which the temporary file will be created.
Yields
------
fname : str
The path to the temporary file.
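    Examples
    --------
    A minimal sketch of the intended usage (mirrors the package's own tests):

    >>> from pathlib import Path
    >>> with temporary_file() as fname:
    ...     with open(fname, "w", encoding="utf-8") as output:
    ...         _ = output.write("some content")
    ...     print(Path(fname).exists())
    True
    >>> Path(fname).exists()
    False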
"""
tmp = tempfile.NamedTemporaryFile(delete=False, dir=path) # type: ignore
# Close the temp file so that it can be opened elsewhere
tmp.close()
try:
yield tmp.name
finally:
if os.path.exists(tmp.name):
os.remove(tmp.name)
def unique_file_name(url: str) -> str:
"""
Create a unique file name based on the given URL.
The file name will be unique to the URL by prepending the name with the MD5
hash (hex digest) of the URL. The name will also include the last portion
of the URL.
The format will be: ``{md5}-{filename}.{ext}``
    The file name will be cropped so that the entire name (including the hash)
    is no longer than 255 characters (the limit on most file systems).
Parameters
----------
url : str
The URL with a file name at the end.
Returns
-------
fname : str
The file name, unique to this URL.
Examples
--------
>>> print(unique_file_name("https://www.some-server.org/2020/data.txt"))
02ddee027ce5ebb3d7059fb23d210604-data.txt
>>> print(unique_file_name("https://www.some-server.org/2019/data.txt"))
9780092867b497fca6fc87d8308f1025-data.txt
>>> print(unique_file_name("https://www.some-server.org/2020/data.txt.gz"))
181a9d52e908219c2076f55145d6a344-data.txt.gz
"""
md5 = hashlib.md5(url.encode(), usedforsecurity=False).hexdigest()
fname = parse_url(url)["path"].split("/")[-1]
    # Crop the start of the file name so that the full name (hash + "-"
    # separator + file name) fits in 255 characters
fname = fname[-(255 - len(md5) - 1) :]
unique_name = f"{md5}-{fname}"
return unique_name