From 4f94896f175a424e0919ce2124eb3db90f00b62d Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Fri, 16 Aug 2024 17:01:30 -0400 Subject: [PATCH 01/21] WIP - Major refactoring of testing setup utilities to use pooch --- xclim/testing/helpers.py | 360 ++++++++++++++++++++++++++++++--------- xclim/testing/utils.py | 247 +-------------------------- 2 files changed, 289 insertions(+), 318 deletions(-) diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index 34e10823d..d50f23b5c 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -2,22 +2,33 @@ from __future__ import annotations +import importlib.resources as ilr +import logging import os import re +import shutil +import tempfile import time import warnings from datetime import datetime as dt from pathlib import Path from shutil import copytree from sys import platform +from urllib.error import HTTPError import numpy as np import pandas as pd +import pooch import xarray as xr from dask.diagnostics import Callback from filelock import FileLock from packaging.version import Version +try: + from pytest_socket import SocketBlockedError +except ImportError: + SocketBlockedError = None + import xclim from xclim import __version__ as __xclim_version__ from xclim.core import calendar @@ -27,12 +38,30 @@ shortwave_upwelling_radiation_from_net_downwelling, ) from xclim.testing.utils import _default_cache_dir # noqa -from xclim.testing.utils import get_file as _get_file -from xclim.testing.utils import get_local_testdata as _get_local_testdata from xclim.testing.utils import open_dataset as _open_dataset -TESTDATA_BRANCH = os.getenv("XCLIM_TESTDATA_BRANCH", "main") -"""Sets the branch of Ouranosinc/xclim-testdata to use when fetching testing datasets. +TESTDATA_REPO_URL = str( + os.getenv("XCLIM_TESTDATA_REPO_URL", "https://github.com/Ouranosinc/xclim-testdata") +) +"""Sets the URL of the testing data repository to use when fetching datasets. + +Notes +----- +When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: + +.. code-block:: console + + $ export XCLIM_TESTDATA_REPO_URL="https://github.com/my_username/xclim-testdata" + +or setting the variable at runtime: + +.. code-block:: console + + $ env XCLIM_TESTDATA_REPO_URL="https://github.com/my_username/xclim-testdata" pytest +""" + +TESTDATA_BRANCH = str(os.getenv("XCLIM_TESTDATA_BRANCH", "main")) +"""Sets the branch of the testing data repository to use when fetching datasets. Notes ----- @@ -47,10 +76,9 @@ .. code-block:: console $ env XCLIM_TESTDATA_BRANCH="my_testing_branch" pytest - """ -PREFETCH_TESTING_DATA = os.getenv("XCLIM_PREFETCH_TESTING_DATA", False) +PREFETCH_TESTING_DATA = bool(os.getenv("XCLIM_PREFETCH_TESTING_DATA")) """Indicates whether the testing data should be downloaded when running tests. Notes @@ -69,16 +97,59 @@ .. code-block:: console $ env XCLIM_PREFETCH_TESTING_DATA=1 pytest +""" + +CACHE_DIR = os.getenv("XCLIM_DATA_DIR", _default_cache_dir) +"""Sets the directory to store the testing datasets. + +If not set, the default location will be used (based on ``platformdirs``, see :func:`pooch.os_cache`). + +Notes +----- +When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: + +.. code-block:: console + + $ export XCLIM_DATA_DIR="/path/to/my/data" +or setting the variable at runtime: + +.. 
code-block:: console + + $ env XCLIM_DATA_DIR="/path/to/my/data" pytest """ +DATA_UPDATES = bool(os.getenv("XCLIM_DATA_UPDATES")) +"""Sets whether to allow updates to the testing datasets. + +If set to ``True``, the data files will be downloaded even if the upstream hashes do not match. + +Notes +----- +When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: + +.. code-block:: console + + $ export XCLIM_DATA_UPDATES=True + +or setting the variable at runtime: + +.. code-block:: console + + $ env XCLIM_DATA_UPDATES=True pytest +""" + + +DATA_URL = f"{TESTDATA_REPO_URL}/raw/{TESTDATA_BRANCH}/data" + __all__ = [ + "DATA_UPDATES", + "DATA_URL", "PREFETCH_TESTING_DATA", "TESTDATA_BRANCH", "add_example_file_paths", "assert_lazy", "generate_atmos", - "populate_testing_data", "test_timeseries", ] @@ -89,10 +160,9 @@ def testing_setup_warnings(): # This does not need to be emitted on GitHub Workflows and ReadTheDocs if not os.getenv("CI") and not os.getenv("READTHEDOCS"): warnings.warn( - f'`xclim` {__xclim_version__} is running tests against the "main" branch of `Ouranosinc/xclim-testdata`. ' - "It is possible that changes in xclim-testdata may be incompatible with test assertions in this version. " - "Please be sure to check https://github.com/Ouranosinc/xclim-testdata for more information.", - UserWarning, + f'`xclim` {__xclim_version__} is running tests against the "main" branch of the testing data. ' + "It is possible that changes to the testing data may be incompatible with some assertions in this version. " + f"Please be sure to check {TESTDATA_REPO_URL} for more information.", ) if re.match(r"^v\d+\.\d+\.\d+", TESTDATA_BRANCH): @@ -107,13 +177,209 @@ def testing_setup_warnings(): if Version(TESTDATA_BRANCH) > Version(install_calendar_version): warnings.warn( - f"Installation date of `xclim` ({install_date.ctime()}) " - f"predates the last release of `xclim-testdata` ({TESTDATA_BRANCH}). " + f"The installation date of `xclim` ({install_date.ctime()}) predates the last release of testing data ({TESTDATA_BRANCH}). " "It is very likely that the testing data is incompatible with this build of `xclim`.", - UserWarning, ) +def load_registry( + file: str | Path | None = None, remote: str = DATA_URL +) -> dict[str, str]: + """Load the registry file for the test data. + + Parameters + ---------- + file : str or Path, optional + Path to the registry file. If not provided, the registry file found within the package data will be used. + remote : str + URL to the remote registry folder. + + Returns + ------- + dict + Dictionary of filenames and hashes. + """ + + def _fetcher(f: str, r: str, c: str) -> str: + try: + return pooch.retrieve( + url=f"{r}/{f}", + known_hash=None, + path=c, + fname="registry.txt", + ) + except HTTPError: + raise + except SocketBlockedError: + raise + + # Get registry file from package_data + if file is None: + registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt"))) + if not registry_file.exists(): + registry_file.touch() + try: + with tempfile.TemporaryDirectory() as tempdir: + remote_registry_file = _fetcher(registry_file.name, remote, tempdir) + # Check if the local registry file matches the remote registry + if pooch.file_hash(remote_registry_file) != pooch.file_hash( + registry_file.as_posix() + ): + warnings.warn( + "Local registry file does not match remote registry file." 
+                    )
+                    shutil.move(remote_registry_file, registry_file)
+        except FileNotFoundError:
+            warnings.warn(
+                "Registry file not accessible in remote repository. "
+                "Aborting file retrieval and using local registry file."
+            )
+        except SocketBlockedError:
+            warnings.warn(
+                "Testing suite is being run with `--disable-socket`. Using local registry file."
+            )
+        if not registry_file.exists():
+            raise FileNotFoundError(
+                f"Local registry file not found: {registry_file}. "
+                "Testing setup cannot proceed without registry file."
+            )
+    else:
+        registry_file = Path(file)
+        if not registry_file.exists():
+            raise FileNotFoundError(f"Registry file not found: {registry_file}")
+
+    logging.info("Registry file found: %s", registry_file)
+
+    # Load the registry file
+    registry = dict()
+    with registry_file.open() as buffer:
+        for entry in buffer.readlines():
+            registry[entry.split()[0]] = entry.split()[1]
+
+    return registry
+
+
+def nimbus(  # noqa: PR01
+    data_dir: str | Path = CACHE_DIR,
+    data_updates: bool = DATA_UPDATES,
+    data_url: str = DATA_URL,
+):
+    """Pooch registry instance for xclim test data.
+
+    Parameters
+    ----------
+    data_dir : str or Path
+        Path to the directory where the data files are stored.
+    data_updates : bool
+        If True, allow updates to the data files.
+    data_url : str
+        Base URL to download the data files.
+
+    Returns
+    -------
+    pooch.Pooch
+        Pooch instance for the xclim test data.
+
+    Notes
+    -----
+    There are three environment variables that can be used to control the behaviour of this registry:
+    - ``XCLIM_DATA_DIR``: If this environment variable is set, it will be used as the base directory to store the data
+      files. The directory should be an absolute path (i.e., it should start with ``/``). Otherwise,
+      the default location will be used (based on ``platformdirs``, see :py:func:`pooch.os_cache`).
+    - ``XCLIM_DATA_UPDATES``: If this environment variable is set, then the data files will be downloaded even if the
+      upstream hashes do not match. This is useful if you want to always use the latest version of the data files.
+    - ``XCLIM_DATA_URL``: If this environment variable is set, it will be used as the base URL to download the data files.
+
+    Examples
+    --------
+    Using the registry to download a file:
+
+    .. code-block:: python
+
+        import xarray as xr
+        from xclim.testing.helpers import nimbus
+
+        example_file = nimbus().fetch("example.nc")
+        data = xr.open_dataset(example_file)
+    """
+    return pooch.create(
+        path=data_dir,
+        base_url=data_url,
+        version=__xclim_version__,
+        version_dev="main",
+        allow_updates=data_updates,
+        registry=load_registry(remote=data_url),
+    )
+
+
+def populate_testing_data(
+    registry_file: str | Path | None = None,
+    temp_folder: Path | None = None,
+    branch: str = TESTDATA_BRANCH,
+    _data_url: str = DATA_URL,
+    _local_cache: Path = _default_cache_dir,
+) -> None:
+    """Populate the local cache with the testing data.
+
+    Parameters
+    ----------
+    registry_file : str or Path, optional
+        Path to the registry file. If not provided, the registry file from package_data will be used.
+    temp_folder : Path, optional
+        Path to a temporary folder to use as the local cache. If not provided, the default location will be used.
+    branch : str, optional
+        Branch of Ouranosinc/xclim-testdata to use when fetching testing datasets.
+    _data_url : str
+        URL for the testing data.
+        Defaults to the module-level `DATA_URL` ({TESTDATA_REPO_URL}/raw/{TESTDATA_BRANCH}/data).
+    _local_cache : Path
+        Path to the local cache. 
Defaults to the location set by the platformdirs library. + The testing data will be downloaded to this local cache. + + Returns + ------- + None + """ + # Get registry file from package_data or provided path + registry = load_registry(registry_file) + # Set the local cache to the temp folder + if temp_folder is not None: + _local_cache = temp_folder + + # Create the Pooch instance + n = nimbus(data_url=_data_url) + + # Set the branch + n.version_dev = branch + # Set the local cache + n.path = _local_cache + + # Download the files + errored_files = [] + for file in registry.keys(): + try: + n.fetch(file) + except HTTPError: + msg = f"File `{file}` not accessible in remote repository." + logging.error(msg) + errored_files.append(file) + except SocketBlockedError as e: + msg = ( + "Unable to access registry file online. Testing suite is being run with `--disable-socket`. " + "If you intend to run tests with this option enabled, please download the file beforehand with the " + "following console command: `$ xclim prefetch_testing_data`." + ) + raise SocketBlockedError(msg) from e + else: + logging.info("Files were downloaded successfully.") + finally: + if errored_files: + logging.error( + "The following files were unable to be downloaded: %s", + errored_files, + ) + + def generate_atmos(cache_dir: Path) -> dict[str, xr.DataArray]: """Create the `atmosds` synthetic testing dataset.""" with _open_dataset( @@ -153,72 +419,6 @@ def generate_atmos(cache_dir: Path) -> dict[str, xr.DataArray]: return namespace -def populate_testing_data( - temp_folder: Path | None = None, - branch: str = TESTDATA_BRANCH, - _local_cache: Path = _default_cache_dir, -): - """Perform `_get_file` or `get_local_dataset` calls to GitHub to download or copy relevant testing data.""" - if _local_cache.joinpath(".data_written").exists(): - # This flag prevents multiple calls from re-attempting to download testing data in the same pytest run - return - - data_entries = [ - "CanESM2_365day/pr_day_CanESM2_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc", - "ERA5/daily_surface_cancities_1990-1993.nc", - "EnsembleReduce/TestEnsReduceCriteria.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_ACCESS1-0_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_BNU-ESM_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r2i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_CNRM-CM5_historical+rcp45_r1i1p1_1970-2050_tg_mean_YS.nc", - "FWI/GFWED_sample_2017.nc", - "FWI/cffdrs_test_fwi.nc", - "FWI/cffdrs_test_wDC.nc", - "HadGEM2-CC_360day/pr_day_HadGEM2-CC_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc", - "NRCANdaily/nrcan_canada_daily_pr_1990.nc", - "NRCANdaily/nrcan_canada_daily_tasmax_1990.nc", - "NRCANdaily/nrcan_canada_daily_tasmin_1990.nc", - "Raven/q_sim.nc", - "SpatialAnalogs/CanESM2_ScenGen_Chibougamau_2041-2070.nc", - "SpatialAnalogs/NRCAN_SECan_1981-2010.nc", - "SpatialAnalogs/dissimilarity.nc", - "SpatialAnalogs/indicators.nc", - "cmip3/tas.sresb1.giss_model_e_r.run1.atm.da.nc", - "cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712.nc", - "sdba/CanESM2_1950-2100.nc", - "sdba/ahccd_1950-2013.nc", - "sdba/nrcan_1950-2013.nc", - "uncertainty_partitioning/cmip5_pr_global_mon.nc", - "uncertainty_partitioning/seattle_avg_tas.csv", - ] - - data = dict() - for filepattern in data_entries: - if temp_folder is None: - try: - 
data[filepattern] = _get_file( - filepattern, branch=branch, cache_dir=_local_cache - ) - except FileNotFoundError: - warnings.warn( - "File {filepattern} was not found. Consider verifying the file exists." - ) - continue - elif temp_folder: - try: - data[filepattern] = _get_local_testdata( - filepattern, - temp_folder=temp_folder, - branch=branch, - _local_cache=_local_cache, - ) - except FileNotFoundError: - warnings.warn("File {filepattern} was not found.") - continue - return - - def gather_testing_data(threadsafe_data_dir: Path, worker_id: str): """Gather testing data across workers.""" if ( diff --git a/xclim/testing/utils.py b/xclim/testing/utils.py index bcc3691f8..dd142429a 100644 --- a/xclim/testing/utils.py +++ b/xclim/testing/utils.py @@ -6,25 +6,20 @@ # Some of this code was copied and adapted from xarray from __future__ import annotations -import hashlib -import json import logging import os import platform import re import sys -import warnings from collections.abc import Sequence from importlib import import_module from io import StringIO from pathlib import Path -from shutil import copy from typing import TextIO from urllib.error import HTTPError, URLError from urllib.parse import urljoin, urlparse -from urllib.request import urlopen, urlretrieve +from urllib.request import urlretrieve -import pandas as pd from platformdirs import user_cache_dir from xarray import Dataset from xarray import open_dataset as _open_dataset @@ -64,9 +59,6 @@ __all__ = [ "_default_cache_dir", "audit_url", - "get_file", - "get_local_testdata", - "list_datasets", "list_input_variables", "open_dataset", "publish_release_notes", @@ -75,13 +67,6 @@ ] -def file_md5_checksum(f_name): - hash_md5 = hashlib.md5() # noqa: S324 - with open(f_name, "rb") as f: - hash_md5.update(f.read()) - return hash_md5.hexdigest() - - def audit_url(url: str, context: str = None) -> str: """Check if the URL is well-formed. @@ -103,212 +88,37 @@ def audit_url(url: str, context: str = None) -> str: return url -def get_file( - name: str | os.PathLike[str] | Sequence[str | os.PathLike[str]], - github_url: str = "https://github.com/Ouranosinc/xclim-testdata", - branch: str = "main", - cache_dir: Path = _default_cache_dir, -) -> Path | list[Path]: - """Return a file from an online GitHub-like repository. - - If a local copy is found then always use that to avoid network traffic. - - Parameters - ---------- - name : str | os.PathLike[str] | Sequence[str | os.PathLike[str]] - Name of the file or list/tuple of names of files containing the dataset(s) including suffixes. - github_url : str - URL to GitHub repository where the data is stored. - branch : str, optional - For GitHub-hosted files, the branch to download from. - cache_dir : Path - The directory in which to search for and write cached data. - - Returns - ------- - Path | list[Path] - """ - if isinstance(name, (str, os.PathLike)): - name = [name] - - files = [] - for n in name: - fullname = Path(n) - suffix = fullname.suffix - files.append( - _get( - fullname=fullname, - github_url=github_url, - branch=branch, - suffix=suffix, - cache_dir=cache_dir, - ) - ) - if len(files) == 1: - return files[0] - return files - - -def get_local_testdata( - patterns: str | Sequence[str], - temp_folder: str | os.PathLike, - branch: str = "master", - _local_cache: str | os.PathLike = _default_cache_dir, -) -> Path | list[Path]: - """Copy specific testdata from a default cache to a temporary folder. - - Return files matching `pattern` in the default cache dir and move to a local temp folder. 
- - Parameters - ---------- - patterns : str | Sequence[str] - Glob patterns, which must include the folder. - temp_folder : str | os.PathLike - Target folder to copy files and filetree to. - branch : str - For GitHub-hosted files, the branch to download from. - _local_cache : str | os.PathLike - Local cache of testing data. - - Returns - ------- - Path | list[Path] - """ - temp_paths = [] - - if isinstance(patterns, str): - patterns = [patterns] - - for pattern in patterns: - potential_paths = [ - path for path in Path(temp_folder).joinpath(branch).glob(pattern) - ] - if potential_paths: - temp_paths.extend(potential_paths) - continue - - testdata_path = Path(_local_cache) - if not testdata_path.exists(): - raise RuntimeError(f"{testdata_path} does not exists") - paths = [path for path in testdata_path.joinpath(branch).glob(pattern)] - if not paths: - raise FileNotFoundError( - f"No data found for {pattern} at {testdata_path}/{branch}." - ) - - main_folder = Path(temp_folder).joinpath(branch).joinpath(Path(pattern).parent) - main_folder.mkdir(exist_ok=True, parents=True) - - for file in paths: - temp_file = main_folder.joinpath(file.name) - if not temp_file.exists(): - copy(file, main_folder) - temp_paths.append(temp_file) - - # Return item directly when singleton, for convenience - return temp_paths[0] if len(temp_paths) == 1 else temp_paths - - def _get( - fullname: Path, + name: Path, github_url: str, branch: str, - suffix: str, cache_dir: Path, ) -> Path: cache_dir = cache_dir.absolute() - local_file = cache_dir / branch / fullname - md5_name = fullname.with_suffix(f"{suffix}.md5") - md5_file = cache_dir / branch / md5_name + local_file = cache_dir / branch / name if not github_url.startswith("https"): raise ValueError(f"GitHub URL not secure: '{github_url}'.") - if local_file.is_file(): - local_md5 = file_md5_checksum(local_file) - try: - url = "/".join((github_url, "raw", branch, md5_name.as_posix())) - msg = f"Attempting to fetch remote file md5: {md5_name.as_posix()}" - logger.info(msg) - urlretrieve(audit_url(url), md5_file) # noqa: S310 - with open(md5_file) as f: - remote_md5 = f.read() - if local_md5.strip() != remote_md5.strip(): - local_file.unlink() - msg = ( - f"MD5 checksum for {local_file.as_posix()} does not match upstream md5. " - "Attempting new download." - ) - warnings.warn(msg) - except HTTPError: - msg = ( - f"{md5_name.as_posix()} not accessible in remote repository. " - "Unable to determine validity with upstream repo." - ) - warnings.warn(msg) - except URLError: - msg = ( - f"{md5_name.as_posix()} not found in remote repository. " - "Unable to determine validity with upstream repo." - ) - warnings.warn(msg) - except SocketBlockedError: - msg = f"Unable to access {md5_name.as_posix()} online. Testing suite is being run with `--disable-socket`." - warnings.warn(msg) - if not local_file.is_file(): # This will always leave this directory on disk. # We may want to add an option to remove it. local_file.parent.mkdir(exist_ok=True, parents=True) - - url = "/".join((github_url, "raw", branch, fullname.as_posix())) - msg = f"Fetching remote file: {fullname.as_posix()}" + url = "/".join((github_url, "raw", branch, name.as_posix())) + msg = f"Fetching remote file: {name.as_posix()}" logger.info(msg) try: urlretrieve(audit_url(url), local_file) # noqa: S310 except HTTPError as e: - msg = f"{fullname.as_posix()} not accessible in remote repository. Aborting file retrieval." 
- raise FileNotFoundError(msg) from e - except URLError as e: - msg = ( - f"{fullname.as_posix()} not found in remote repository. " - "Verify filename and repository address. Aborting file retrieval." - ) + msg = f"{name.as_posix()} not accessible in remote repository. Aborting file retrieval." raise FileNotFoundError(msg) from e except SocketBlockedError as e: msg = ( - f"Unable to access {fullname.as_posix()} online. Testing suite is being run with `--disable-socket`. " + f"Unable to access {name.as_posix()} online. Testing suite is being run with `--disable-socket`. " f"If you intend to run tests with this option enabled, please download the file beforehand with the " f"following console command: `xclim prefetch_testing_data`." ) raise FileNotFoundError(msg) from e - try: - url = "/".join((github_url, "raw", branch, md5_name.as_posix())) - msg = f"Fetching remote file md5: {md5_name.as_posix()}" - logger.info(msg) - urlretrieve(audit_url(url), md5_file) # noqa: S310 - except (HTTPError, URLError) as e: - msg = ( - f"{md5_name.as_posix()} not accessible online. " - "Unable to determine validity of file from upstream repo. " - "Aborting file retrieval." - ) - local_file.unlink() - raise FileNotFoundError(msg) from e - - local_md5 = file_md5_checksum(local_file) - try: - with open(md5_file) as f: - remote_md5 = f.read() - if local_md5.strip() != remote_md5.strip(): - local_file.unlink() - msg = ( - f"{local_file.as_posix()} and md5 checksum do not match. " - "There may be an issue with the upstream origin data." - ) - raise OSError(msg) - except OSError as e: - logger.error(e) return local_file @@ -316,9 +126,8 @@ def _get( # idea copied from raven that it borrowed from xclim that borrowed it from xarray that was borrowed from Seaborn def open_dataset( name: str | os.PathLike[str], - suffix: str | None = None, dap_url: str | None = None, - github_url: str = "https://github.com/Ouranosinc/xclim-testdata", + github_url: str = "https://github.com/Ouranosinc/xclim-testdata/data", branch: str = "main", cache: bool = True, cache_dir: Path = _default_cache_dir, @@ -332,8 +141,6 @@ def open_dataset( ---------- name : str or os.PathLike Name of the file containing the dataset. - suffix : str, optional - If no suffix is given, assumed to be netCDF ('.nc' is appended). For no suffix, set "". dap_url : str, optional URL to OPeNDAP folder where the data is stored. If supplied, supersedes github_url. github_url : str @@ -357,9 +164,6 @@ def open_dataset( """ if isinstance(name, (str, os.PathLike)): name = Path(name) - if suffix is None: - suffix = ".nc" - fullname = name.with_suffix(suffix) if dap_url is not None: dap_file_address = urljoin(dap_url, str(name)) @@ -374,10 +178,9 @@ def open_dataset( raise OSError(msg) local_file = _get( - fullname=fullname, + name=name, github_url=github_url, branch=branch, - suffix=suffix, cache_dir=cache_dir, ) @@ -391,38 +194,6 @@ def open_dataset( raise err -def list_datasets(github_repo="Ouranosinc/xclim-testdata", branch="main"): - """Return a DataFrame listing all xclim test datasets available on the GitHub repo for the given branch. - - The result includes the filepath, as passed to `open_dataset`, the file size (in KB) and the html url to the file. - This uses an unauthenticated call to GitHub's REST API, so it is limited to 60 requests per hour (per IP). - A single call of this function triggers one request per subdirectory, so use with parsimony. 
- """ - with urlopen( # noqa: S310 - audit_url(f"https://api.github.com/repos/{github_repo}/contents?ref={branch}") - ) as res: - base = json.loads(res.read().decode()) - records = [] - for folder in base: - if folder["path"].startswith(".") or folder["size"] > 0: - # drop hidden folders and other files. - continue - with urlopen(audit_url(folder["url"])) as res: # noqa: S310 - listing = json.loads(res.read().decode()) - for file in listing: - if file["path"].endswith(".nc"): - records.append( - { - "name": file["path"], - "size": file["size"] / 2**10, - "url": file["html_url"], - } - ) - df = pd.DataFrame.from_records(records).set_index("name") - print(f"Found {len(df)} datasets.") - return df - - def list_input_variables( submodules: Sequence[str] | None = None, realms: Sequence[str] | None = None ) -> dict: From 8d14fa4d625ff50b7617bbed4b2acba32399473e Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Mon, 19 Aug 2024 17:12:28 -0400 Subject: [PATCH 02/21] wip - more refactoring --- tests/conftest.py | 51 ++++++------- tests/test_partitioning.py | 16 +++- tests/test_testing_utils.py | 35 ++++----- xclim/ensembles/_partitioning.py | 2 +- xclim/testing/helpers.py | 122 +++++++++++++++---------------- xclim/testing/registry.txt | 51 +++++++++++++ xclim/testing/utils.py | 21 +++--- 7 files changed, 174 insertions(+), 124 deletions(-) create mode 100644 xclim/testing/registry.txt diff --git a/tests/conftest.py b/tests/conftest.py index 32f7c978b..95c36f4eb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,9 +13,9 @@ from xclim.core import indicator from xclim.core.calendar import max_doy from xclim.testing import helpers +from xclim.testing.helpers import nimbus as _nimbus from xclim.testing.helpers import test_timeseries -from xclim.testing.utils import _default_cache_dir # noqa -from xclim.testing.utils import get_file +from xclim.testing.utils import default_cache_dir # noqa from xclim.testing.utils import open_dataset as _open_dataset @@ -25,15 +25,24 @@ def random() -> np.random.Generator: @pytest.fixture -def tmp_netcdf_filename(tmpdir) -> Path: +def tmp_netcdf_filename(tmpdir): yield Path(tmpdir).joinpath("testfile.nc") @pytest.fixture(autouse=True, scope="session") -def threadsafe_data_dir(tmp_path_factory) -> Path: +def threadsafe_data_dir(tmp_path_factory): yield Path(tmp_path_factory.getbasetemp().joinpath("data")) +@pytest.fixture(autouse=True, scope="session") +def nimbus(threadsafe_data_dir): + yield _nimbus( + data_dir=threadsafe_data_dir, + repo=helpers.TESTDATA_REPO_URL, + branch=helpers.TESTDATA_BRANCH, + ) + + @pytest.fixture def lat_series(): def _lat_series(values): @@ -312,6 +321,14 @@ def cmip3_day_tas(threadsafe_data_dir): ds.close() +@pytest.fixture(scope="session") +def get_file(nimbus): + def _get_session_scoped_file(file: str): + nimbus.fetch(file) + + return _get_session_scoped_file + + @pytest.fixture(scope="session") def open_dataset(threadsafe_data_dir): def _open_session_scoped_file( @@ -361,30 +378,6 @@ def ensemble_dataset_objects() -> dict[str, str]: return edo -@pytest.fixture(scope="session") -def lafferty_sriver_ds() -> xr.Dataset: - """Get data from Lafferty & Sriver unit test. 
- - Notes - ----- - https://github.com/david0811/lafferty-sriver_2023_npjCliAtm/tree/main/unit_test - """ - fn = get_file( - "uncertainty_partitioning/seattle_avg_tas.csv", - cache_dir=_default_cache_dir, - branch=helpers.TESTDATA_BRANCH, - ) - - df = pd.read_csv(fn, parse_dates=["time"]).rename( - columns={"ssp": "scenario", "ensemble": "downscaling"} - ) - - # Make xarray dataset - return xr.Dataset.from_dataframe( - df.set_index(["scenario", "model", "downscaling", "time"]) - ) - - @pytest.fixture(scope="session", autouse=True) def gather_session_data(threadsafe_data_dir, worker_id): """Gather testing data on pytest run. @@ -408,7 +401,7 @@ def cleanup(request): """ def remove_data_written_flag(): - flag = _default_cache_dir.joinpath(".data_written") + flag = default_cache_dir.joinpath(".data_written") if flag.exists(): flag.unlink() diff --git a/tests/test_partitioning.py b/tests/test_partitioning.py index f34691985..54e27d823 100644 --- a/tests/test_partitioning.py +++ b/tests/test_partitioning.py @@ -1,6 +1,7 @@ from __future__ import annotations import numpy as np +import pandas as pd import xarray as xr from xclim.ensembles import fractional_uncertainty, hawkins_sutton, lafferty_sriver @@ -107,8 +108,19 @@ def test_lafferty_sriver_synthetic(random): lafferty_sriver(da, sm=sm) -def test_lafferty_sriver(lafferty_sriver_ds): - _g, u = lafferty_sriver(lafferty_sriver_ds.tas) +def test_lafferty_sriver(get_file): + seattle = get_file("uncertainty_partitioning/seattle_avg_tas.csv") + + df = pd.read_csv(seattle, parse_dates=["time"]).rename( + columns={"ssp": "scenario", "ensemble": "downscaling"} + ) + + # Make xarray dataset + ds = xr.Dataset.from_dataframe( + df.set_index(["scenario", "model", "downscaling", "time"]) + ) + + _g, u = lafferty_sriver(ds.tas) fu = fractional_uncertainty(u) diff --git a/tests/test_testing_utils.py b/tests/test_testing_utils.py index 3bbc044e3..63e0881a0 100644 --- a/tests/test_testing_utils.py +++ b/tests/test_testing_utils.py @@ -29,6 +29,16 @@ def test_timeseries_made_up_variable(self): class TestFileRequests: + + @staticmethod + def file_md5_checksum(f_name): + import hashlib + + hash_md5 = hashlib.md5() # noqa: S324 + with open(f_name, "rb") as f: + hash_md5.update(f.read()) + return hash_md5.hexdigest() + @pytest.mark.requires_internet def test_get_failure(self, tmp_path): bad_repo_address = "https://github.com/beard/of/zeus/" @@ -37,7 +47,6 @@ def test_get_failure(self, tmp_path): Path("san_diego", "60_percent_of_the_time_it_works_everytime"), bad_repo_address, "main", - ".rudd", tmp_path, ) @@ -58,20 +67,18 @@ def test_open_dataset_with_bad_file(self, tmp_path): new_cmip3_file = utilities._get( Path("cmip3", cmip3_file), github_url="https://github.com/Ouranosinc/xclim-testdata", - suffix=".nc", branch="main", cache_dir=tmp_path, ) # Ensure that the new cmip3 file is in the cache directory assert ( - utilities.file_md5_checksum(Path(cmip3_folder, new_cmip3_file)) - != bad_cmip3_md5 + self.file_md5_checksum(Path(cmip3_folder, new_cmip3_file)) != bad_cmip3_md5 ) # Ensure that the md5 file was updated at the same time assert ( - utilities.file_md5_checksum(Path(cmip3_folder, new_cmip3_file)) + self.file_md5_checksum(Path(cmip3_folder, new_cmip3_file)) == Path(cmip3_folder, cmip3_md5).read_text() ) @@ -82,26 +89,10 @@ def test_open_testdata(self): ) assert ds.lon.size == 128 - # Not that this test is super slow, but there is no real value in spamming GitHub's API for no reason. 
- @pytest.mark.slow - @pytest.mark.xfail(reason="Test is rate limited by GitHub.") - def test_list_datasets(self): - out = utilities.list_datasets() - - assert list(out.columns) == ["size", "url"] - np.testing.assert_allclose( - out.loc["cmip6/o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.nc"][ - "size" - ], - 845.021484375, - ) - - -class TestFileAssertions: def test_md5_sum(self): test_data = Path(__file__).parent / "data" callendar = test_data / "callendar_1938.txt" - md5_sum = utilities.file_md5_checksum(callendar) + md5_sum = self.file_md5_checksum(callendar) if sys.platform == "win32": # Windows has a different line ending (CR-LF) than Unix (LF) assert md5_sum == "38083271c2d4c85dea6bd6baf23d34de" # noqa diff --git a/xclim/ensembles/_partitioning.py b/xclim/ensembles/_partitioning.py index ce957d672..5c18b5102 100644 --- a/xclim/ensembles/_partitioning.py +++ b/xclim/ensembles/_partitioning.py @@ -197,7 +197,7 @@ def hawkins_sutton_09_weighting(da, obs, baseline=("1971", "2000")): def lafferty_sriver( da: xr.DataArray, - sm: xr.DataArray = None, + sm: xr.DataArray | None = None, bb13: bool = False, ) -> tuple[xr.DataArray, xr.DataArray]: """Return the mean and partitioned variance of an ensemble based on method from Lafferty and Sriver (2023). diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index d50f23b5c..8b1b687a1 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -20,7 +20,7 @@ import pandas as pd import pooch import xarray as xr -from dask.diagnostics import Callback +from dask.callbacks import Callback from filelock import FileLock from packaging.version import Version @@ -37,9 +37,12 @@ longwave_upwelling_radiation_from_net_downwelling, shortwave_upwelling_radiation_from_net_downwelling, ) -from xclim.testing.utils import _default_cache_dir # noqa +from xclim.testing.utils import default_cache_dir from xclim.testing.utils import open_dataset as _open_dataset +default_testdata_version = "v2023.12.14" + + TESTDATA_REPO_URL = str( os.getenv("XCLIM_TESTDATA_REPO_URL", "https://github.com/Ouranosinc/xclim-testdata") ) @@ -99,7 +102,7 @@ $ env XCLIM_PREFETCH_TESTING_DATA=1 pytest """ -CACHE_DIR = os.getenv("XCLIM_DATA_DIR", _default_cache_dir) +CACHE_DIR = os.getenv("XCLIM_DATA_DIR", default_cache_dir) """Sets the directory to store the testing datasets. If not set, the default location will be used (based on ``platformdirs``, see :func:`pooch.os_cache`). @@ -139,12 +142,8 @@ $ env XCLIM_DATA_UPDATES=True pytest """ - -DATA_URL = f"{TESTDATA_REPO_URL}/raw/{TESTDATA_BRANCH}/data" - __all__ = [ "DATA_UPDATES", - "DATA_URL", "PREFETCH_TESTING_DATA", "TESTDATA_BRANCH", "add_example_file_paths", @@ -156,11 +155,11 @@ def testing_setup_warnings(): """Warn users about potential incompatibilities between xclim and xclim-testdata versions.""" - if re.match(r"^\d+\.\d+\.\d+$", __xclim_version__) and TESTDATA_BRANCH == "main": + if re.match(r"^\d+\.\d+\.\d+$", __xclim_version__) and TESTDATA_BRANCH != "main": # This does not need to be emitted on GitHub Workflows and ReadTheDocs if not os.getenv("CI") and not os.getenv("READTHEDOCS"): warnings.warn( - f'`xclim` {__xclim_version__} is running tests against the "main" branch of the testing data. ' + f"`xclim` stable ({__xclim_version__}) is running tests against a non-default branch of the testing data. " "It is possible that changes to the testing data may be incompatible with some assertions in this version. 
" f"Please be sure to check {TESTDATA_REPO_URL} for more information.", ) @@ -183,7 +182,9 @@ def testing_setup_warnings(): def load_registry( - file: str | Path | None = None, remote: str = DATA_URL + file: str | Path | None = None, + repo: str = TESTDATA_REPO_URL, + branch: str = TESTDATA_BRANCH, ) -> dict[str, str]: """Load the registry file for the test data. @@ -191,36 +192,28 @@ def load_registry( ---------- file : str or Path, optional Path to the registry file. If not provided, the registry file found within the package data will be used. - remote : str - URL to the remote registry folder. Returns ------- dict Dictionary of filenames and hashes. """ - - def _fetcher(f: str, r: str, c: str) -> str: - try: - return pooch.retrieve( - url=f"{r}/{f}", - known_hash=None, - path=c, - fname="registry.txt", - ) - except HTTPError: - raise - except SocketBlockedError: - raise + remote = f"{repo}/raw/{branch}/data" # Get registry file from package_data if file is None: registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt"))) if not registry_file.exists(): registry_file.touch() + url = f"{remote}/{registry_file.name}" try: with tempfile.TemporaryDirectory() as tempdir: - remote_registry_file = _fetcher(registry_file.name, remote, tempdir) + remote_registry_file = pooch.retrieve( + url=url, + known_hash=None, + path=tempdir, + fname="registry.txt", + ) # Check if the local registry file matches the remote registry if pooch.file_hash(remote_registry_file) != pooch.file_hash( registry_file.as_posix() @@ -262,8 +255,9 @@ def _fetcher(f: str, r: str, c: str) -> str: def nimbus( # noqa: PR01 data_dir: str | Path = CACHE_DIR, data_updates: bool = DATA_UPDATES, - data_url: str = DATA_URL, -): + repo: str = TESTDATA_REPO_URL, + branch: str = TESTDATA_BRANCH, +) -> pooch.Pooch: """Pooch registry instance for xhydro test data. Parameters @@ -272,8 +266,10 @@ def nimbus( # noqa: PR01 Path to the directory where the data files are stored. data_updates : bool If True, allow updates to the data files. - data_url : str - Base URL to download the data files. + repo : str + URL of the repository to use when fetching testing datasets. + branch : str + Branch of repository to use when fetching testing datasets. Returns ------- @@ -288,7 +284,10 @@ def nimbus( # noqa: PR01 the default location will be used (based on ``platformdirs``, see :py:func:`pooch.os_cache`). - ``XCLIM_DATA_UPDATES``: If this environment variable is set, then the data files will be downloaded even if the upstream hashes do not match. This is useful if you want to always use the latest version of the data files. - - ``XCLIM_DATA_URL``: If this environment variable is set, it will be used as the base URL to download the data files. + - ``XCLIM_TESTDATA_REPO_URL``: If this environment variable is set, it will be used as the URL of the repository + to use when fetching datasets. Otherwise, the default repository will be used. + - ``XCLIM_TESTDATA_BRANCH``: If this environment variable is set, it will be used as the branch of the repository + to use when fetching datasets. Otherwise, the default branch will be used. 
     Examples
     --------
     Using the registry to download a file:
 
     .. code-block:: python
 
         import xarray as xr
         from xclim.testing.helpers import nimbus
 
         example_file = nimbus().fetch("example.nc")
         data = xr.open_dataset(example_file)
     """
+    remote = f"{repo}/raw/{branch}/data"
+
     return pooch.create(
         path=data_dir,
-        base_url=data_url,
-        version=__xclim_version__,
-        version_dev="main",
+        base_url=remote,
+        version=default_testdata_version,
+        version_dev=branch,
         allow_updates=data_updates,
-        registry=load_registry(remote=data_url),
+        registry=load_registry(repo=repo, branch=branch),
     )
 
 
 def populate_testing_data(
     registry_file: str | Path | None = None,
     temp_folder: Path | None = None,
-    branch: str = TESTDATA_BRANCH,
-    _data_url: str = DATA_URL,
-    _local_cache: Path = _default_cache_dir,
+    repo: str | None = None,
+    branch: str | None = None,
+    local_cache: Path = default_cache_dir,
 ) -> None:
     """Populate the local cache with the testing data.
 
     Parameters
     ----------
     registry_file : str or Path, optional
         Path to the registry file. If not provided, the registry file from package_data will be used.
     temp_folder : Path, optional
         Path to a temporary folder to use as the local cache. If not provided, the default location will be used.
+    repo : str, optional
+        URL of the repository to use when fetching testing datasets.
     branch : str, optional
         Branch of Ouranosinc/xclim-testdata to use when fetching testing datasets.
-    _data_url : str
-        URL for the testing data.
-        Defaults to the module-level `DATA_URL` ({TESTDATA_REPO_URL}/raw/{TESTDATA_BRANCH}/data).
-    _local_cache : Path
+    local_cache : Path
         Path to the local cache. Defaults to the location set by the platformdirs library.
         The testing data will be downloaded to this local cache.
 
     Returns
     -------
     None
     """
-    # Get registry file from package_data or provided path
-    registry = load_registry(registry_file)
-    # Set the local cache to the temp folder
+    if repo is None:
+        _repo = TESTDATA_REPO_URL
+    else:
+        _repo = repo
+    if branch is None:
+        _branch = TESTDATA_BRANCH
+    else:
+        _branch = branch
     if temp_folder is not None:
         _local_cache = temp_folder
+    else:
+        _local_cache = Path(local_cache)
 
     # Create the Pooch instance
-    n = nimbus(data_url=_data_url)
+    n = nimbus(data_dir=_local_cache, repo=_repo, branch=_branch)
 
-    # Set the branch
-    n.version_dev = branch
-    # Set the local cache
-    n.path = _local_cache
+    # Load the registry file
+    registry = load_registry(file=registry_file, repo=_repo, branch=_branch)
 
     # Download the files
     errored_files = []
     for file in registry.keys():
         try:
             n.fetch(file)
         except HTTPError:
             msg = f"File `{file}` not accessible in remote repository."
             logging.error(msg)
             errored_files.append(file)
-        except SocketBlockedError as e:
+        except SocketBlockedError as e:  # noqa
             msg = (
                 "Unable to access registry file online. Testing suite is being run with `--disable-socket`. "
                 "If you intend to run tests with this option enabled, please download the file beforehand with the "
                 "following console command: `$ xclim prefetch_testing_data`."
             )
             raise SocketBlockedError(msg) from e
 
 
 def gather_testing_data(threadsafe_data_dir: Path, worker_id: str):
     """Gather testing data across workers."""
-    if (
-        not _default_cache_dir.joinpath(TESTDATA_BRANCH).exists()
-        or PREFETCH_TESTING_DATA
-    ):
+    if not default_cache_dir.exists() or PREFETCH_TESTING_DATA:
        if PREFETCH_TESTING_DATA:
            print("`XCLIM_PREFETCH_TESTING_DATA` set. 
Prefetching testing data...") if platform == "win32": @@ -435,17 +435,17 @@ def gather_testing_data(threadsafe_data_dir: Path, worker_id: str): elif worker_id in ["master"]: populate_testing_data(branch=TESTDATA_BRANCH) else: - _default_cache_dir.mkdir(exist_ok=True, parents=True) - lockfile = _default_cache_dir.joinpath(".lock") + default_cache_dir.mkdir(exist_ok=True, parents=True) + lockfile = default_cache_dir.joinpath(".lock") test_data_being_written = FileLock(lockfile) with test_data_being_written: # This flag prevents multiple calls from re-attempting to download testing data in the same pytest run populate_testing_data(branch=TESTDATA_BRANCH) - _default_cache_dir.joinpath(".data_written").touch() + default_cache_dir.joinpath(".data_written").touch() with test_data_being_written.acquire(): if lockfile.exists(): lockfile.unlink() - copytree(_default_cache_dir, threadsafe_data_dir) + copytree(default_cache_dir, threadsafe_data_dir) def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]: diff --git a/xclim/testing/registry.txt b/xclim/testing/registry.txt new file mode 100644 index 000000000..ec0fbbfd4 --- /dev/null +++ b/xclim/testing/registry.txt @@ -0,0 +1,51 @@ +CanESM2_365day/pr_day_CanESM2_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc sha256:16dafec260dd74bf38f87482baa34cc35a1689facfb5557ebfc7d2c928618fc7 +CanESM2_365day/tasmax_day_CanESM2_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc sha256:0c57c56e38a9e5b0623180c3def9406e9ddabbe7b1c01b282f1a34c4a61ea357 +CanESM2_365day/tasmin_day_CanESM2_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc sha256:5d43ec47759bf9d118942277fe8d7c632765c3a0ba02dc828b0610e1f2030a63 +cmip3/tas.sresb1.giss_model_e_r.run1.atm.da.nc sha256:e709552beeeccafcfe280759edf5477ae5241c698409ca051b0899c16e92c95e +cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712.nc sha256:7471770e4e654997225ab158f2b24aa0510b6f06006fb757b9ea7c0d4a47e1f2 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_200512-203011.nc sha256:3cb54d67bf89cdf542a7b93205785da3800f9a77eaa8436f4ee74af13b248b95 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_203012-205511.nc sha256:31b9a4139574012acbc9d7fdb210af8d00d45119a9b98ebcab67905262543c6d +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_205512-208011.nc sha256:8c18253f8039dfda0aba71f69e5fde367453fc8a239936ee54c6d32db184f3b9 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_208012-209912.nc sha256:bd7e419c8d6b60dbe700517a16453f787b147bb15cfdebf0519e882fa967f5a0 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_209912-212411.nc sha256:54dda14b6c2d8dce8e3a2ff526ffba8cc54bf5de5ace96eec93d060256fd63b6 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_212412-214911.nc sha256:35791a451c392d3dae69ecb789c4a952eff761dddab934389c7d0686feeb6e72 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_214912-217411.nc sha256:156577a84d82c23f65e019ba58fcdbb7677f1a1128f4745d72441896d0485a11 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_217412-219911.nc sha256:b6378f082aa6d877fae46be9663e1fe3bf82e0d596aaf501afa6217fcc300878 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_219912-222411.nc sha256:21c8db59941ad5481433b69eae5c9efed534c0fc35062ab767a481be9da503b6 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_222412-224911.nc sha256:e8d406cc7b87d0899236610e1a9ddecde8279d0d26316114496f159565fb78ba +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_224912-227411.nc sha256:abbe16349870c501335f7f17a5372703f82e8db84f911d29c31783bb07100e6e +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_227412-229911.nc sha256:ecf52dc8ac13e04d0b643fc53cc5b367b32e68a311e6718686eaa87088788f98 +cmip5/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_229912-229912.nc 
sha256:3fa657483072d8a04363b8718bc9c4e63e6354617a4ab3d627b25222a4cd094c +cmip6/o3_Amon_GFDL-ESM4_historical_r1i1p1f1_gr1_185001-194912.nc sha256:cfff189d4986289efb2b88f418cd6d65b26b59355b67b73ca26ac8fa12a9f83f +cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc sha256:58a03aa401f80751ad60c8950f14bcf717aeb6ef289169cb5ae3081bb4689825 +CRCM5/tasmax_bby_198406_se.nc sha256:9a80cc19ed212428ef90ce0cc40790fbf0d1fc301df0abdf578da45843dae93d +EnsembleReduce/TestEnsReduceCriteria.nc sha256:ae7a70b9d5c54ab072f1cfbfab91d430a41c5067db3c1968af57ea2122cfe8e7 +EnsembleStats/BCCAQv2+ANUSPLIN300_ACCESS1-0_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc sha256:ca0cc893cf91db7c6dfe3df10d605684eabbea55b7e26077c10142d302e55aed +EnsembleStats/BCCAQv2+ANUSPLIN300_BNU-ESM_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc sha256:c796276f563849c31bf388a3beb4a440eeb72062a84b4cf9760c854d1e990ca4 +EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc sha256:9cfa9bc4e81e936eb680a55db428ccd9f0a6d366d4ae2c4a9064bfa5d71e5ca7 +EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r2i1p1_1950-2100_tg_mean_YS.nc sha256:ca36aafb3c63ddb6bfc8537abb854b71f719505c1145d5c81c3315eb1a13647c +EnsembleStats/BCCAQv2+ANUSPLIN300_CNRM-CM5_historical+rcp45_r1i1p1_1970-2050_tg_mean_YS.nc sha256:623eab96d75d8cc8abd59dfba1c14cfb06fd7c0fe9ce86788d3c8b0891684df2 +ERA5/daily_surface_cancities_1990-1993.nc sha256:049d54ace3d229a96cc621189daa3e1a393959ab8d988221cfc7b2acd7ab94b2 +FWI/GFWED_sample_2017.nc sha256:cf3bde795825663894fa7619a028d5a14fee307c623968235f25393f7afe159e +FWI/cffdrs_test_fwi.nc sha256:147be24e080aa67f17261f61f05a5dfb381a66a23785a327e47e2303667ca3ab +FWI/cffdrs_test_wDC.nc sha256:ebadcad1dd6a1a1e93c29a1143d7caefd46593ea2fbeb721015245981cce90c3 +HadGEM2-CC_360day/pr_day_HadGEM2-CC_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc sha256:c45ff4c17ba9fd92392bb08a7705789071a0bec40bde48f5a838ff12413cc33b +HadGEM2-CC_360day/tasmax_day_HadGEM2-CC_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc sha256:aa3eb54ea69bb00330de1037a48ac13dbc5b72f346c801d97731dec8260f400c +HadGEM2-CC_360day/tasmin_day_HadGEM2-CC_rcp85_r1i1p1_na10kgrid_qm-moving-50bins-detrend_2095.nc sha256:5c8fa666603fd68f614d95ac8c5a0dbdfb9f8e2e86666a270516a38526c1aa20 +NRCANdaily/nrcan_canada_daily_pr_1990.nc sha256:144479ec7a976cfecb6a10762d128a771356093d72caf5f075508ee86d25a1b0 +NRCANdaily/nrcan_canada_daily_tasmax_1990.nc sha256:84880205b798740e37a102c7f40e595d7a4fde6e35fb737a1ef68b8dad447526 +NRCANdaily/nrcan_canada_daily_tasmin_1990.nc sha256:13d61fc54cdcb4c1617ec777ccbf59575d8fdc24754f914042301bc1b024d7f7 +Raven/q_sim.nc sha256:f7a0ae73c498235e1c3e7338a184c5ca3729941b81521e606aa60b2c639f6e71 +sdba/CanESM2_1950-2100.nc sha256:b41fe603676e70d16c747ec207eb75ec86a39b665de401dcb23b5969ab3e1b32 +sdba/adjusted_external.nc sha256:ff325c88eca96844bc85863744e4e08bcdf3d257388255636427ad5e11960d2e +sdba/ahccd_1950-2013.nc sha256:7e9a1f61c1d04ca257b09857a82715f1fa3f0550d77f97b7306d4eaaf0c70239 +sdba/nrcan_1950-2013.nc sha256:4ce2dcfdac09b028db0f3e348272a496d796c36d4f3c4a412ebcca11449b7237 +uncertainty_partitioning/cmip5_pr_global_mon.nc sha256:7e585c995e95861979fd23dd9346f78a879403ea1d1d15acaa627802b4c5f1f4 +uncertainty_partitioning/cmip5_pr_pnw_mon.nc sha256:1cdfe74f5bd5cf71cd0737c190277821ea90e4e79de5b37367bf2b82c35a66c9 +uncertainty_partitioning/cmip5_tas_global_mon.nc sha256:41ba79a43bab169a0487e3f3f66a68a699bef9355a13e26a87fdb65744555cb5 +uncertainty_partitioning/cmip5_tas_pnw_mon.nc 
sha256:eeb48765fd430186f3634e7f779b4be45ab3df73e806a4cbb743fefb13279398 +SpatialAnalogs/CanESM2_ScenGen_Chibougamau_2041-2070.nc sha256:b6cfc4a963d68b6da8978acd26ffb506f33c9c264d8057badd90bf47cd9f3f3d +SpatialAnalogs/NRCAN_SECan_1981-2010.nc sha256:bde680ddad84106caad3a2e83a70ecdd8138578a70e875d77c2ec6d3ff868fee +SpatialAnalogs/dissimilarity.nc sha256:200ab9b7d43d41e6db917c54d35b43e3c5853e0df701e44efd5b813e47590110 +SpatialAnalogs/indicators.nc sha256:3bcbb0e4540d4badc085ac42b9d04a353e815fb55c62271eb73275b889c80a15 +uncertainty_partitioning/seattle_avg_tas.csv sha256:157d6721f9925eec8268848e34548df2b1da50935f247a9b136d251ef53898d7 diff --git a/xclim/testing/utils.py b/xclim/testing/utils.py index dd142429a..b396c4a99 100644 --- a/xclim/testing/utils.py +++ b/xclim/testing/utils.py @@ -20,7 +20,7 @@ from urllib.parse import urljoin, urlparse from urllib.request import urlretrieve -from platformdirs import user_cache_dir +import pooch from xarray import Dataset from xarray import open_dataset as _open_dataset @@ -51,14 +51,14 @@ "boltons", ] - -_default_cache_dir = Path(user_cache_dir("xclim-testdata")) +default_cache_dir = Path(pooch.os_cache("xclim-testdata")) +"""Default location for the testing data cache.""" logger = logging.getLogger("xclim") __all__ = [ - "_default_cache_dir", "audit_url", + "default_cache_dir", "list_input_variables", "open_dataset", "publish_release_notes", @@ -67,7 +67,7 @@ ] -def audit_url(url: str, context: str = None) -> str: +def audit_url(url: str, context: str | None = None) -> str: """Check if the URL is well-formed. Raises @@ -104,13 +104,16 @@ def _get( # This will always leave this directory on disk. # We may want to add an option to remove it. local_file.parent.mkdir(exist_ok=True, parents=True) - url = "/".join((github_url, "raw", branch, name.as_posix())) + url = "/".join((github_url, "raw", branch, "data", name.as_posix())) msg = f"Fetching remote file: {name.as_posix()}" logger.info(msg) try: urlretrieve(audit_url(url), local_file) # noqa: S310 except HTTPError as e: - msg = f"{name.as_posix()} not accessible in remote repository. Aborting file retrieval." + msg = ( + f"{name.as_posix()} not accessible in remote repository: {url}. " + "Aborting file retrieval." + ) raise FileNotFoundError(msg) from e except SocketBlockedError as e: msg = ( @@ -127,10 +130,10 @@ def _get( def open_dataset( name: str | os.PathLike[str], dap_url: str | None = None, - github_url: str = "https://github.com/Ouranosinc/xclim-testdata/data", + github_url: str = "https://github.com/Ouranosinc/xclim-testdata", branch: str = "main", cache: bool = True, - cache_dir: Path = _default_cache_dir, + cache_dir: Path = default_cache_dir, **kwargs, ) -> Dataset: r"""Open a dataset from the online GitHub-like repository. 
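
Before the next commit, it is worth noting how the pieces introduced so far fit together: the packaged ``registry.txt``, the ``load_registry`` helper, and the ``nimbus`` Pooch instance reduce to a small fetch flow. The sketch below is illustrative rather than part of the patch series: the repository URL, the ``<repo>/raw/<branch>/data`` layout, and the example file name come from the code and registry above, while the local ``registry.txt`` path is a hypothetical stand-in for the packaged copy.

.. code-block:: python

    import pooch

    # Defaults mirrored from xclim/testing/helpers.py:
    repo = "https://github.com/Ouranosinc/xclim-testdata"
    branch = "main"

    # Each registry line is "<relative path> <algorithm>:<hash>", as in registry.txt above.
    registry = {}
    with open("registry.txt") as f:  # hypothetical local copy of xclim/testing/registry.txt
        for line in f:
            line = line.strip()
            if not line:
                continue
            name, checksum = line.split()
            registry[name] = checksum

    # nimbus() wraps roughly this call (the version pinning arguments are omitted here).
    cache = pooch.create(
        path=pooch.os_cache("xclim-testdata"),
        base_url=f"{repo}/raw/{branch}/data",
        registry=registry,
    )

    # Downloads on first use, then reuses the cached, checksum-verified copy.
    local_file = cache.fetch("ERA5/daily_surface_cancities_1990-1993.nc")
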
From e4609a09b8103fa0d26a572d5d93882f0f3e3129 Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:20:42 -0400 Subject: [PATCH 03/21] wip - first working version of major refactoring --- pyproject.toml | 4 +- tests/conftest.py | 117 +++++++------- tests/test_analog.py | 10 +- tests/test_atmos.py | 2 +- tests/test_indices.py | 8 +- tests/test_partitioning.py | 16 +- tests/test_testing_utils.py | 68 +-------- xclim/cli.py | 12 +- xclim/testing/conftest.py | 5 +- xclim/testing/helpers.py | 294 +++++++++++++++++------------------- xclim/testing/utils.py | 148 +----------------- 11 files changed, 223 insertions(+), 461 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3ea17821f..628356fb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ dependencies = [ "packaging >=24.0", "pandas >=2.2", "pint >=0.18", - "platformdirs >=3.2", + "pooch >=1.8.0", "pyarrow >=15.0.0", # Strongly encouraged for pandas v2.2.0+ "pyyaml >=6.0.1", "scikit-learn >=0.21.3", @@ -79,8 +79,6 @@ dev = [ "nbval >=0.11.0", "pandas-stubs >=2.2", "pip >=24.0", - "platformdirs >=3.2", - "pooch >=1.8.0", "pre-commit >=3.7", "pylint >=3.2.4", "pytest >=8.0.0", diff --git a/tests/conftest.py b/tests/conftest.py index 95c36f4eb..e98fb784f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,10 +13,10 @@ from xclim.core import indicator from xclim.core.calendar import max_doy from xclim.testing import helpers +from xclim.testing.helpers import default_cache_dir # noqa from xclim.testing.helpers import nimbus as _nimbus +from xclim.testing.helpers import open_dataset as _open_dataset from xclim.testing.helpers import test_timeseries -from xclim.testing.utils import default_cache_dir # noqa -from xclim.testing.utils import open_dataset as _open_dataset @pytest.fixture @@ -26,21 +26,7 @@ def random() -> np.random.Generator: @pytest.fixture def tmp_netcdf_filename(tmpdir): - yield Path(tmpdir).joinpath("testfile.nc") - - -@pytest.fixture(autouse=True, scope="session") -def threadsafe_data_dir(tmp_path_factory): - yield Path(tmp_path_factory.getbasetemp().joinpath("data")) - - -@pytest.fixture(autouse=True, scope="session") -def nimbus(threadsafe_data_dir): - yield _nimbus( - data_dir=threadsafe_data_dir, - repo=helpers.TESTDATA_REPO_URL, - branch=helpers.TESTDATA_BRANCH, - ) + return Path(tmpdir).joinpath("testfile.nc") @pytest.fixture @@ -57,6 +43,11 @@ def _lat_series(values): return _lat_series +@pytest.fixture +def timeseries(): + return test_timeseries + + @pytest.fixture def tas_series(): """Return mean temperature time series.""" @@ -309,40 +300,30 @@ def rlus_series(): @pytest.fixture(scope="session") -def cmip3_day_tas(threadsafe_data_dir): - # xr.set_options(enable_cftimeindex=False) - ds = _open_dataset( - "cmip3/tas.sresb1.giss_model_e_r.run1.atm.da.nc", - cache_dir=threadsafe_data_dir, - branch=helpers.TESTDATA_BRANCH, - engine="h5netcdf", - ) - yield ds.tas - ds.close() +def threadsafe_data_dir(tmp_path_factory): + return Path(tmp_path_factory.getbasetemp().joinpath("data")) @pytest.fixture(scope="session") -def get_file(nimbus): - def _get_session_scoped_file(file: str): - nimbus.fetch(file) - - return _get_session_scoped_file +def nimbus(threadsafe_data_dir): + return _nimbus( + data_dir=threadsafe_data_dir, + repo=helpers.TESTDATA_REPO_URL, + branch=helpers.TESTDATA_BRANCH, + ) @pytest.fixture(scope="session") -def open_dataset(threadsafe_data_dir): - def _open_session_scoped_file( - file: str | os.PathLike, 
branch: str = helpers.TESTDATA_BRANCH, **xr_kwargs - ): +def open_dataset(nimbus): + def _open_session_scoped_file(file: str | os.PathLike, **xr_kwargs): + xr_kwargs.setdefault("cache", True) xr_kwargs.setdefault("engine", "h5netcdf") - return _open_dataset( - file, cache_dir=threadsafe_data_dir, branch=branch, **xr_kwargs - ) + return _open_dataset(file, cache_dir=nimbus.path, **xr_kwargs) return _open_session_scoped_file -@pytest.fixture +@pytest.fixture(scope="session") def official_indicators(): # Remove unofficial indicators (as those created during the tests, and those from YAML-built modules) registry_cp = indicator.registry.copy() @@ -352,17 +333,39 @@ def official_indicators(): return registry_cp -@pytest.fixture(scope="function") -def atmosds(threadsafe_data_dir) -> xr.Dataset: +@pytest.fixture +def lafferty_sriver_ds(nimbus) -> xr.Dataset: + """Get data from Lafferty & Sriver unit test. + + Notes + ----- + https://github.com/david0811/lafferty-sriver_2023_npjCliAtm/tree/main/unit_test + """ + fn = nimbus.fetch( + "uncertainty_partitioning/seattle_avg_tas.csv", + ) + + df = pd.read_csv(fn, parse_dates=["time"]).rename( + columns={"ssp": "scenario", "ensemble": "downscaling"} + ) + + # Make xarray dataset + return xr.Dataset.from_dataframe( + df.set_index(["scenario", "model", "downscaling", "time"]) + ) + + +@pytest.fixture +def atmosds(nimbus) -> xr.Dataset: + """Get synthetic atmospheric dataset.""" return _open_dataset( - threadsafe_data_dir.joinpath("atmosds.nc"), - cache_dir=threadsafe_data_dir, - branch=helpers.TESTDATA_BRANCH, + "atmosds.nc", + cache_dir=nimbus.path, engine="h5netcdf", ).load() -@pytest.fixture(scope="function") +@pytest.fixture(scope="session") def ensemble_dataset_objects() -> dict[str, str]: edo = dict() edo["nc_files_simple"] = [ @@ -378,8 +381,8 @@ def ensemble_dataset_objects() -> dict[str, str]: return edo -@pytest.fixture(scope="session", autouse=True) -def gather_session_data(threadsafe_data_dir, worker_id): +@pytest.fixture(autouse=True, scope="session") +def gather_session_data(request, nimbus, worker_id): """Gather testing data on pytest run. When running pytest with multiple workers, one worker will copy data remotely to _default_cache_dir while @@ -389,25 +392,13 @@ def gather_session_data(threadsafe_data_dir, worker_id): Additionally, this fixture is also used to generate the `atmosds` synthetic testing dataset. """ helpers.testing_setup_warnings() - helpers.gather_testing_data(threadsafe_data_dir, worker_id) - helpers.generate_atmos(threadsafe_data_dir) - - -@pytest.fixture(scope="session", autouse=True) -def cleanup(request): - """Cleanup a testing file once we are finished. - - This flag prevents remote data from being downloaded multiple times in the same pytest run. 
- """ + helpers.gather_testing_data(nimbus.path, worker_id) + helpers.generate_atmos(nimbus.path) def remove_data_written_flag(): + """Cleanup cache folder once we are finished.""" flag = default_cache_dir.joinpath(".data_written") if flag.exists(): flag.unlink() request.addfinalizer(remove_data_written_flag) - - -@pytest.fixture -def timeseries(): - return test_timeseries diff --git a/tests/test_analog.py b/tests/test_analog.py index 72857b007..2608df226 100644 --- a/tests/test_analog.py +++ b/tests/test_analog.py @@ -58,8 +58,8 @@ def test_exact_randn(exact_randn): @pytest.mark.slow @pytest.mark.parametrize("method", xca.metrics.keys()) def test_spatial_analogs(method, open_dataset): - diss = open_dataset("SpatialAnalogs/dissimilarity") - data = open_dataset("SpatialAnalogs/indicators") + diss = open_dataset("SpatialAnalogs/dissimilarity.nc") + data = open_dataset("SpatialAnalogs/indicators.nc") target = data.sel(lat=46.1875, lon=-72.1875, time=slice("1970", "1990")) candidates = data.sel(time=slice("1970", "1990")) @@ -75,7 +75,7 @@ def test_spatial_analogs(method, open_dataset): def test_unsupported_spatial_analog_method(open_dataset): method = "KonMari" - data = open_dataset("SpatialAnalogs/indicators") + data = open_dataset("SpatialAnalogs/indicators.nc") target = data.sel(lat=46.1875, lon=-72.1875, time=slice("1970", "1990")) candidates = data.sel(time=slice("1970", "1990")) @@ -87,8 +87,8 @@ def test_unsupported_spatial_analog_method(open_dataset): def test_spatial_analogs_multi_index(open_dataset): # Test multi-indexes - diss = open_dataset("SpatialAnalogs/dissimilarity") - data = open_dataset("SpatialAnalogs/indicators") + diss = open_dataset("SpatialAnalogs/dissimilarity.nc") + data = open_dataset("SpatialAnalogs/indicators.nc") target = data.sel(lat=46.1875, lon=-72.1875, time=slice("1970", "1990")) candidates = data.sel(time=slice("1970", "1990")) diff --git a/tests/test_atmos.py b/tests/test_atmos.py index 10d5d0efe..23929550d 100644 --- a/tests/test_atmos.py +++ b/tests/test_atmos.py @@ -94,7 +94,7 @@ def test_humidex(tas_series): def test_heat_index(atmosds): - # Keep just Montreal values for summertime as we need tas > 20 degC + # Keep just Montreal values for summer as we need tas > 20 degC tas = atmosds.tasmax[1][150:170] hurs = atmosds.hurs[1][150:170] diff --git a/tests/test_indices.py b/tests/test_indices.py index a9386087f..69142a077 100644 --- a/tests/test_indices.py +++ b/tests/test_indices.py @@ -2562,12 +2562,14 @@ def test_simple(self, open_dataset, ind, exp): out = ind(ds.tas.sel(location="Victoria")) np.testing.assert_almost_equal(out[0], exp, decimal=4) - def test_indice_against_icclim(self, cmip3_day_tas): + def test_indice_against_icclim(self, open_dataset): from xclim.indicators import icclim # noqa + cmip3_tas = open_dataset("cmip3/tas.sresb1.giss_model_e_r.run1.atm.da.nc").tas + with set_options(cf_compliance="log"): - ind = xci.tg_mean(cmip3_day_tas) - icclim = icclim.TG(cmip3_day_tas) + ind = xci.tg_mean(cmip3_tas) + icclim = icclim.TG(cmip3_tas) np.testing.assert_array_equal(icclim, ind) diff --git a/tests/test_partitioning.py b/tests/test_partitioning.py index 54e27d823..f34691985 100644 --- a/tests/test_partitioning.py +++ b/tests/test_partitioning.py @@ -1,7 +1,6 @@ from __future__ import annotations import numpy as np -import pandas as pd import xarray as xr from xclim.ensembles import fractional_uncertainty, hawkins_sutton, lafferty_sriver @@ -108,19 +107,8 @@ def test_lafferty_sriver_synthetic(random): lafferty_sriver(da, sm=sm) -def 
test_lafferty_sriver(get_file): - seattle = get_file("uncertainty_partitioning/seattle_avg_tas.csv") - - df = pd.read_csv(seattle, parse_dates=["time"]).rename( - columns={"ssp": "scenario", "ensemble": "downscaling"} - ) - - # Make xarray dataset - ds = xr.Dataset.from_dataframe( - df.set_index(["scenario", "model", "downscaling", "time"]) - ) - - _g, u = lafferty_sriver(ds.tas) +def test_lafferty_sriver(lafferty_sriver_ds): + _g, u = lafferty_sriver(lafferty_sriver_ds.tas) fu = fractional_uncertainty(u) diff --git a/tests/test_testing_utils.py b/tests/test_testing_utils.py index 63e0881a0..35646f12e 100644 --- a/tests/test_testing_utils.py +++ b/tests/test_testing_utils.py @@ -3,14 +3,14 @@ import platform import sys from pathlib import Path -from urllib.error import URLError import numpy as np import pytest from xarray import Dataset -import xclim.testing.utils as utilities from xclim import __version__ as __xclim_version__ +from xclim.testing import helpers +from xclim.testing import utils as utilities from xclim.testing.helpers import test_timeseries as timeseries @@ -39,52 +39,9 @@ def file_md5_checksum(f_name): hash_md5.update(f.read()) return hash_md5.hexdigest() - @pytest.mark.requires_internet - def test_get_failure(self, tmp_path): - bad_repo_address = "https://github.com/beard/of/zeus/" - with pytest.raises(FileNotFoundError): - utilities._get( - Path("san_diego", "60_percent_of_the_time_it_works_everytime"), - bad_repo_address, - "main", - tmp_path, - ) - - @pytest.mark.requires_internet - def test_open_dataset_with_bad_file(self, tmp_path): - cmip3_folder = tmp_path.joinpath("main", "cmip3") - cmip3_folder.mkdir(parents=True) - - cmip3_file = "tas.sresb1.giss_model_e_r.run1.atm.da.nc" - Path(cmip3_folder, cmip3_file).write_text("This file definitely isn't right.") - - cmip3_md5 = f"{cmip3_file}.md5" - bad_cmip3_md5 = "bc51206e6462fc8ed08fd4926181274c" - Path(cmip3_folder, cmip3_md5).write_text(bad_cmip3_md5) - - # Check for raised warning for local file md5 sum and remote md5 sum - with pytest.warns(UserWarning): - new_cmip3_file = utilities._get( - Path("cmip3", cmip3_file), - github_url="https://github.com/Ouranosinc/xclim-testdata", - branch="main", - cache_dir=tmp_path, - ) - - # Ensure that the new cmip3 file is in the cache directory - assert ( - self.file_md5_checksum(Path(cmip3_folder, new_cmip3_file)) != bad_cmip3_md5 - ) - - # Ensure that the md5 file was updated at the same time - assert ( - self.file_md5_checksum(Path(cmip3_folder, new_cmip3_file)) - == Path(cmip3_folder, cmip3_md5).read_text() - ) - @pytest.mark.requires_internet def test_open_testdata(self): - ds = utilities.open_dataset( + ds = helpers.open_dataset( Path("cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712"), engine="h5netcdf" ) assert ds.lon.size == 128 @@ -126,22 +83,3 @@ def test_release_notes_file_not_implemented(self, tmp_path): temp_filename = tmp_path.joinpath("version_info.txt") with pytest.raises(NotImplementedError): utilities.publish_release_notes(style="qq", file=temp_filename) - - -class TestTestingFileAccessors: - def test_unsafe_urls(self): - with pytest.raises( - ValueError, match="GitHub URL not secure: 'ftp://domain.does.not.exist/'." 
- ): - utilities.open_dataset( - "doesnt_exist.nc", github_url="ftp://domain.does.not.exist/" - ) - - def test_malicious_urls(self): - with pytest.raises( - URLError, - match="urlopen error OPeNDAP URL is not well-formed: 'doesnt_exist.nc'", - ): - utilities.open_dataset( - "doesnt_exist.nc", dap_url="Robert'); DROP TABLE STUDENTS; --" - ) diff --git a/xclim/cli.py b/xclim/cli.py index 67a6da1eb..1df887b0d 100644 --- a/xclim/cli.py +++ b/xclim/cli.py @@ -11,13 +11,17 @@ import click import xarray as xr -from dask.diagnostics import ProgressBar +from dask.diagnostics.progress import ProgressBar import xclim as xc from xclim.core.dataflags import DataQualityException, data_flags, ecad_compliant from xclim.core.utils import InputKind -from xclim.testing.helpers import TESTDATA_BRANCH, populate_testing_data -from xclim.testing.utils import _default_cache_dir, publish_release_notes, show_versions +from xclim.testing.helpers import ( + TESTDATA_BRANCH, + default_cache_dir, + populate_testing_data, +) +from xclim.testing.utils import publish_release_notes, show_versions distributed = False try: @@ -169,7 +173,7 @@ def prefetch_testing_data(ctx, branch): f"Gathering testing data from xclim-testdata `{testdata_branch}` branch..." ) click.echo(populate_testing_data(branch=testdata_branch)) - click.echo(f"Testing data saved to `{_default_cache_dir}`.") + click.echo(f"Testing data saved to `{default_cache_dir}`.") ctx.exit() diff --git a/xclim/testing/conftest.py b/xclim/testing/conftest.py index 12af10934..7e175e975 100644 --- a/xclim/testing/conftest.py +++ b/xclim/testing/conftest.py @@ -11,12 +11,11 @@ import pytest from xclim.testing import helpers -from xclim.testing.utils import _default_cache_dir # noqa -from xclim.testing.utils import open_dataset as _open_dataset +from xclim.testing.helpers import open_dataset as _open_dataset @pytest.fixture(autouse=True, scope="session") -def threadsafe_data_dir(tmp_path_factory) -> Path: +def threadsafe_data_dir(tmp_path_factory): """Return a threadsafe temporary directory for storing testing data.""" yield Path(tmp_path_factory.getbasetemp().joinpath("data")) diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index 8b1b687a1..81d2ce31a 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -6,15 +6,14 @@ import logging import os import re -import shutil -import tempfile import time import warnings from datetime import datetime as dt from pathlib import Path from shutil import copytree from sys import platform -from urllib.error import HTTPError +from urllib.error import HTTPError, URLError +from urllib.parse import urljoin, urlparse import numpy as np import pandas as pd @@ -23,6 +22,8 @@ from dask.callbacks import Callback from filelock import FileLock from packaging.version import Version +from xarray import Dataset +from xarray import open_dataset as _open_dataset try: from pytest_socket import SocketBlockedError @@ -37,11 +38,14 @@ longwave_upwelling_radiation_from_net_downwelling, shortwave_upwelling_radiation_from_net_downwelling, ) -from xclim.testing.utils import default_cache_dir -from xclim.testing.utils import open_dataset as _open_dataset + +logger = logging.getLogger("xclim") default_testdata_version = "v2023.12.14" +"""Default version of the testing data to use when fetching datasets.""" +default_cache_dir = Path(pooch.os_cache("xclim-testdata")) +"""Default location for the testing data cache.""" TESTDATA_REPO_URL = str( os.getenv("XCLIM_TESTDATA_REPO_URL", "https://github.com/Ouranosinc/xclim-testdata") @@ 
-122,33 +126,17 @@ $ env XCLIM_DATA_DIR="/path/to/my/data" pytest """ -DATA_UPDATES = bool(os.getenv("XCLIM_DATA_UPDATES")) -"""Sets whether to allow updates to the testing datasets. - -If set to ``True``, the data files will be downloaded even if the upstream hashes do not match. - -Notes ------ -When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: - -.. code-block:: console - - $ export XCLIM_DATA_UPDATES=True - -or setting the variable at runtime: - -.. code-block:: console - - $ env XCLIM_DATA_UPDATES=True pytest -""" __all__ = [ - "DATA_UPDATES", "PREFETCH_TESTING_DATA", "TESTDATA_BRANCH", "add_example_file_paths", "assert_lazy", + "default_cache_dir", "generate_atmos", + "nimbus", + "open_dataset", + "populate_testing_data", "test_timeseries", ] @@ -181,100 +169,48 @@ def testing_setup_warnings(): ) -def load_registry( - file: str | Path | None = None, - repo: str = TESTDATA_REPO_URL, - branch: str = TESTDATA_BRANCH, -) -> dict[str, str]: +def load_registry() -> dict[str, str]: """Load the registry file for the test data. - Parameters - ---------- - file : str or Path, optional - Path to the registry file. If not provided, the registry file found within the package data will be used. - Returns ------- dict Dictionary of filenames and hashes. """ - remote = f"{repo}/raw/{branch}/data" - - # Get registry file from package_data - if file is None: - registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt"))) - if not registry_file.exists(): - registry_file.touch() - url = f"{remote}/{registry_file.name}" - try: - with tempfile.TemporaryDirectory() as tempdir: - remote_registry_file = pooch.retrieve( - url=url, - known_hash=None, - path=tempdir, - fname="registry.txt", - ) - # Check if the local registry file matches the remote registry - if pooch.file_hash(remote_registry_file) != pooch.file_hash( - registry_file.as_posix() - ): - warnings.warn( - "Local registry file does not match remote registry file." - ) - shutil.move(remote_registry_file, registry_file) - except FileNotFoundError: - warnings.warn( - "Registry file not accessible in remote repository. " - "Aborting file retrieval and using local registry file." - ) - except SocketBlockedError: - warnings.warn( - "Testing suite is being run with `--disable-socket`. Using local registry file." - ) - if not registry_file.exists(): - raise FileNotFoundError( - f"Local registry file not found: {registry_file}. " - "Testing setup cannot proceed without registry file." - ) - else: - registry_file = Path(file) - if not registry_file.exists(): - raise FileNotFoundError(f"Registry file not found: {registry_file}") - - logging.info("Registry file found: %s", registry_file) + registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt"))) + if not registry_file.exists(): + raise FileNotFoundError(f"Registry file not found: {registry_file}") # Load the registry file - registry = dict() - with registry_file.open() as buffer: - for entry in buffer.readlines(): - registry[entry.split()[0]] = entry.split()[1] - + with registry_file.open() as f: + registry = {line.split()[0]: line.split()[1] for line in f} return registry def nimbus( # noqa: PR01 data_dir: str | Path = CACHE_DIR, - data_updates: bool = DATA_UPDATES, repo: str = TESTDATA_REPO_URL, branch: str = TESTDATA_BRANCH, + data_updates: bool = True, ) -> pooch.Pooch: - """Pooch registry instance for xhydro test data. + """Pooch registry instance for xclim test data. 
     Parameters
     ----------
     data_dir : str or Path
         Path to the directory where the data files are stored.
-    data_updates : bool
-        If True, allow updates to the data files.
+
     repo : str
         URL of the repository to use when fetching testing datasets.
     branch : str
         Branch of repository to use when fetching testing datasets.
+    data_updates : bool
+        If True, allow updates to the data files. Default is True.
 
     Returns
     -------
     pooch.Pooch
-        Pooch instance for the xhydro test data.
+        Pooch instance for the xclim test data.
 
     Notes
     -----
@@ -282,8 +218,6 @@ def nimbus(  # noqa: PR01
     - ``XCLIM_DATA_DIR``: If this environment variable is set, it will be used as the base directory to store the
       data files. The directory should be an absolute path (i.e., it should start with ``/``). Otherwise,
       the default location will be used (based on ``platformdirs``, see :py:func:`pooch.os_cache`).
-    - ``XCLIM_DATA_UPDATES``: If this environment variable is set, then the data files will be downloaded even if the
-      upstream hashes do not match. This is useful if you want to always use the latest version of the data files.
    - ``XCLIM_TESTDATA_REPO_URL``: If this environment variable is set, it will be used as the URL of the repository
       to use when fetching datasets. Otherwise, the default repository will be used.
     - ``XCLIM_TESTDATA_BRANCH``: If this environment variable is set, it will be used as the branch of the repository
@@ -302,22 +236,68 @@ def nimbus(  # noqa: PR01
         data = xr.open_dataset(example_file)
     """
     remote = f"{repo}/raw/{branch}/data"
-
     return pooch.create(
         path=data_dir,
         base_url=remote,
         version=default_testdata_version,
         version_dev=branch,
         allow_updates=data_updates,
-        registry=load_registry(repo=repo, branch=branch),
+        registry=load_registry(),
     )
 
 
+# idea borrowed from raven, which borrowed it from xclim, which borrowed it from xarray, which borrowed it from Seaborn
+def open_dataset(
+    name: str | os.PathLike[str],
+    dap_url: str | None = None,
+    cache_dir: str | os.PathLike[str] = default_cache_dir,
+    **kwargs,
+) -> Dataset:
+    r"""Open a dataset from the online GitHub-like repository.
+
+    If a local copy is found then always use that to avoid network traffic.
+
+    Parameters
+    ----------
+    name : str or os.PathLike
+        Name of the file containing the dataset.
+    dap_url : str, optional
+        URL to the OPeNDAP folder where the data is stored. If supplied, supersedes the local cache lookup.
+    cache_dir : str or os.PathLike
+        The directory in which to search for and write cached data.
+    \*\*kwargs
+        For NetCDF files, keywords passed to :py:func:`xarray.open_dataset`.
+
+    Returns
+    -------
+    Dataset
+
+    See Also
+    --------
+    xarray.open_dataset
+    """
+    if dap_url:
+        try:
+            return _open_dataset(
+                audit_url(urljoin(dap_url, str(name)), context="OPeNDAP"), **kwargs
+            )
+        except (OSError, URLError):
+            msg = f"OPeNDAP file not read. Verify that the service is available: '{urljoin(dap_url, str(name))}'"
+            logger.error(msg)
+            raise
+
+    local_file = Path(cache_dir).joinpath(name)
+    try:
+        ds = _open_dataset(local_file, **kwargs)
+        return ds
+    except OSError as err:
+        raise err
+
+
 def populate_testing_data(
-    registry_file: str | Path | None = None,
     temp_folder: Path | None = None,
-    repo: str | None = None,
-    branch: str | None = None,
+    repo: str = TESTDATA_REPO_URL,
+    branch: str = TESTDATA_BRANCH,
     local_cache: Path = default_cache_dir,
 ) -> None:
     """Populate the local cache with the testing data.
 
@@ -329,37 +309,21 @@ def populate_testing_data(
     repo : str, optional
         URL of the repository to use when fetching testing datasets.
branch : str, optional - Branch of hydrologie/xhydro-testdata to use when fetching testing datasets. + Branch of Ouranosinc/xclim-testdata to use when fetching testing datasets. local_cache : Path - Path to the local cache. Defaults to the location set by the platformdirs library. + The path to the local cache. Defaults to the location set by the platformdirs library. The testing data will be downloaded to this local cache. Returns ------- None """ - if repo is None: - _repo = TESTDATA_REPO_URL - else: - _repo = repo - if branch is None: - _branch = TESTDATA_BRANCH - else: - _branch = branch - if temp_folder is not None: - _local_cache = temp_folder - else: - _local_cache = Path(local_cache) - # Create the Pooch instance - n = nimbus(data_dir=_local_cache, repo=_repo, branch=_branch) - - # Load the registry file - registry = load_registry(file=registry_file, repo=_repo, branch=_branch) + n = nimbus(data_dir=temp_folder or local_cache, repo=repo, branch=branch) # Download the files errored_files = [] - for file in registry.keys(): + for file in load_registry(): try: n.fetch(file) except HTTPError: @@ -375,30 +339,28 @@ def populate_testing_data( raise SocketBlockedError(msg) from e else: logging.info("Files were downloaded successfully.") - finally: - if errored_files: - logging.error( - "The following files were unable to be downloaded: %s", - errored_files, - ) + + if errored_files: + logging.error( + "The following files were unable to be downloaded: %s", + errored_files, + ) -def generate_atmos(cache_dir: Path) -> dict[str, xr.DataArray]: +def generate_atmos(cache_dir: str | os.PathLike[str] | Path) -> dict[str, xr.DataArray]: """Create the `atmosds` synthetic testing dataset.""" - with _open_dataset( + with open_dataset( "ERA5/daily_surface_cancities_1990-1993.nc", cache_dir=cache_dir, - branch=TESTDATA_BRANCH, engine="h5netcdf", ) as ds: + rsus = shortwave_upwelling_radiation_from_net_downwelling(ds.rss, ds.rsds) + rlus = longwave_upwelling_radiation_from_net_downwelling(ds.rls, ds.rlds) tn10 = calendar.percentile_doy(ds.tasmin, per=10) t10 = calendar.percentile_doy(ds.tas, per=10) t90 = calendar.percentile_doy(ds.tas, per=90) tx90 = calendar.percentile_doy(ds.tasmax, per=90) - rsus = shortwave_upwelling_radiation_from_net_downwelling(ds.rss, ds.rsds) - rlus = longwave_upwelling_radiation_from_net_downwelling(ds.rls, ds.rlds) - ds = ds.assign( rsus=rsus, rlus=rlus, @@ -413,18 +375,19 @@ def generate_atmos(cache_dir: Path) -> dict[str, xr.DataArray]: ds.to_netcdf(atmos_file, engine="h5netcdf") # Give access to dataset variables by name in namespace - namespace = dict() - with _open_dataset( - atmos_file, branch=TESTDATA_BRANCH, cache_dir=cache_dir, engine="h5netcdf" - ) as ds: - for variable in ds.data_vars: - namespace[f"{variable}_dataset"] = ds.get(variable) + with open_dataset(atmos_file, cache_dir=cache_dir, engine="h5netcdf") as ds: + namespace = {f"{var}_dataset": ds[var] for var in ds.data_vars} return namespace -def gather_testing_data(threadsafe_data_dir: Path, worker_id: str): +def gather_testing_data( + threadsafe_data_dir: str | os.PathLike[str] | Path, worker_id: str +): """Gather testing data across workers.""" - if not default_cache_dir.exists() or PREFETCH_TESTING_DATA: + if ( + not default_cache_dir.joinpath(default_testdata_version).exists() + or PREFETCH_TESTING_DATA + ): if PREFETCH_TESTING_DATA: print("`XCLIM_PREFETCH_TESTING_DATA` set. 
Prefetching testing data...") if platform == "win32": @@ -432,7 +395,7 @@ def gather_testing_data(threadsafe_data_dir: Path, worker_id: str): "UNIX-style file-locking is not supported on Windows. " "Consider running `$ xclim prefetch_testing_data` to download testing data." ) - elif worker_id in ["master"]: + elif worker_id == "master": populate_testing_data(branch=TESTDATA_BRANCH) else: default_cache_dir.mkdir(exist_ok=True, parents=True) @@ -445,29 +408,33 @@ def gather_testing_data(threadsafe_data_dir: Path, worker_id: str): with test_data_being_written.acquire(): if lockfile.exists(): lockfile.unlink() - copytree(default_cache_dir, threadsafe_data_dir) + copytree(default_cache_dir.joinpath(default_testdata_version), threadsafe_data_dir) def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]: """Create a dictionary of relevant datasets to be patched into the xdoctest namespace.""" - namespace: dict = dict() - namespace["path_to_ensemble_file"] = "EnsembleReduce/TestEnsReduceCriteria.nc" - namespace["path_to_pr_file"] = "NRCANdaily/nrcan_canada_daily_pr_1990.nc" - namespace["path_to_sfcWind_file"] = "ERA5/daily_surface_cancities_1990-1993.nc" - namespace["path_to_tas_file"] = "ERA5/daily_surface_cancities_1990-1993.nc" - namespace["path_to_tasmax_file"] = "NRCANdaily/nrcan_canada_daily_tasmax_1990.nc" - namespace["path_to_tasmin_file"] = "NRCANdaily/nrcan_canada_daily_tasmin_1990.nc" + namespace = { + "path_to_ensemble_file": "EnsembleReduce/TestEnsReduceCriteria.nc", + "path_to_pr_file": "NRCANdaily/nrcan_canada_daily_pr_1990.nc", + "path_to_sfcWind_file": "ERA5/daily_surface_cancities_1990-1993.nc", + "path_to_tas_file": "ERA5/daily_surface_cancities_1990-1993.nc", + "path_to_tasmax_file": "NRCANdaily/nrcan_canada_daily_tasmax_1990.nc", + "path_to_tasmin_file": "NRCANdaily/nrcan_canada_daily_tasmin_1990.nc", + "path_to_example_py": ( + Path(__file__).parent.parent.parent.parent + / "docs" + / "notebooks" + / "example.py" + ), + } # For core.utils.load_module example - namespace["path_to_example_py"] = ( - Path(__file__).parent.parent.parent.parent / "docs" / "notebooks" / "example.py" - ) - time = xr.cftime_range("1990-01-01", "2049-12-31", freq="D") + sixty_years = xr.cftime_range("1990-01-01", "2049-12-31", freq="D") namespace["temperature_datasets"] = [ xr.DataArray( - 12 * np.random.random_sample(time.size) + 273, - coords={"time": time}, + 12 * np.random.random_sample(sixty_years.size) + 273, + coords={"time": sixty_years}, name="tas", dims=("time",), attrs={ @@ -477,8 +444,8 @@ def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]: }, ), xr.DataArray( - 12 * np.random.random_sample(time.size) + 273, - coords={"time": time}, + 12 * np.random.random_sample(sixty_years.size) + 273, + coords={"time": sixty_years}, name="tas", dims=("time",), attrs={ @@ -551,3 +518,24 @@ def _raise_on_compute(dsk: dict): assert_lazy = Callback(start=_raise_on_compute) """Context manager that raises an AssertionError if any dask computation is triggered.""" + + +def audit_url(url: str, context: str | None = None) -> str: + """Check if the URL is well-formed. + + Raises + ------ + URLError + If the URL is not well-formed. 
+ """ + msg = "" + result = urlparse(url) + if result.scheme == "http": + msg = f"{context if context else ''} URL is not using secure HTTP: '{url}'".strip() + if not all([result.scheme, result.netloc]): + msg = f"{context if context else ''} URL is not well-formed: '{url}'".strip() + + if msg: + logger.error(msg) + raise URLError(msg) + return url diff --git a/xclim/testing/utils.py b/xclim/testing/utils.py index b396c4a99..6120582f3 100644 --- a/xclim/testing/utils.py +++ b/xclim/testing/utils.py @@ -16,18 +16,6 @@ from io import StringIO from pathlib import Path from typing import TextIO -from urllib.error import HTTPError, URLError -from urllib.parse import urljoin, urlparse -from urllib.request import urlretrieve - -import pooch -from xarray import Dataset -from xarray import open_dataset as _open_dataset - -try: - from pytest_socket import SocketBlockedError -except ImportError: - SocketBlockedError = None _xclim_deps = [ "xclim", @@ -51,152 +39,18 @@ "boltons", ] -default_cache_dir = Path(pooch.os_cache("xclim-testdata")) -"""Default location for the testing data cache.""" logger = logging.getLogger("xclim") + __all__ = [ - "audit_url", - "default_cache_dir", "list_input_variables", - "open_dataset", "publish_release_notes", "run_doctests", "show_versions", ] -def audit_url(url: str, context: str | None = None) -> str: - """Check if the URL is well-formed. - - Raises - ------ - URLError - If the URL is not well-formed. - """ - msg = "" - result = urlparse(url) - if result.scheme == "http": - msg = f"{context if context else ''} URL is not using secure HTTP: '{url}'".strip() - if not all([result.scheme, result.netloc]): - msg = f"{context if context else ''} URL is not well-formed: '{url}'".strip() - - if msg: - logger.error(msg) - raise URLError(msg) - return url - - -def _get( - name: Path, - github_url: str, - branch: str, - cache_dir: Path, -) -> Path: - cache_dir = cache_dir.absolute() - local_file = cache_dir / branch / name - - if not github_url.startswith("https"): - raise ValueError(f"GitHub URL not secure: '{github_url}'.") - - if not local_file.is_file(): - # This will always leave this directory on disk. - # We may want to add an option to remove it. - local_file.parent.mkdir(exist_ok=True, parents=True) - url = "/".join((github_url, "raw", branch, "data", name.as_posix())) - msg = f"Fetching remote file: {name.as_posix()}" - logger.info(msg) - try: - urlretrieve(audit_url(url), local_file) # noqa: S310 - except HTTPError as e: - msg = ( - f"{name.as_posix()} not accessible in remote repository: {url}. " - "Aborting file retrieval." - ) - raise FileNotFoundError(msg) from e - except SocketBlockedError as e: - msg = ( - f"Unable to access {name.as_posix()} online. Testing suite is being run with `--disable-socket`. " - f"If you intend to run tests with this option enabled, please download the file beforehand with the " - f"following console command: `xclim prefetch_testing_data`." - ) - raise FileNotFoundError(msg) from e - - return local_file - - -# idea copied from raven that it borrowed from xclim that borrowed it from xarray that was borrowed from Seaborn -def open_dataset( - name: str | os.PathLike[str], - dap_url: str | None = None, - github_url: str = "https://github.com/Ouranosinc/xclim-testdata", - branch: str = "main", - cache: bool = True, - cache_dir: Path = default_cache_dir, - **kwargs, -) -> Dataset: - r"""Open a dataset from the online GitHub-like repository. - - If a local copy is found then always use that to avoid network traffic. 
- - Parameters - ---------- - name : str or os.PathLike - Name of the file containing the dataset. - dap_url : str, optional - URL to OPeNDAP folder where the data is stored. If supplied, supersedes github_url. - github_url : str - URL to GitHub repository where the data is stored. - branch : str, optional - For GitHub-hosted files, the branch to download from. - cache_dir : Path - The directory in which to search for and write cached data. - cache : bool - If True, then cache data locally for use on subsequent calls. - \*\*kwargs - For NetCDF files, keywords passed to :py:func:`xarray.open_dataset`. - - Returns - ------- - Union[Dataset, Path] - - See Also - -------- - xarray.open_dataset - """ - if isinstance(name, (str, os.PathLike)): - name = Path(name) - - if dap_url is not None: - dap_file_address = urljoin(dap_url, str(name)) - try: - ds = _open_dataset(audit_url(dap_file_address, context="OPeNDAP"), **kwargs) - return ds - except URLError: - raise - except OSError: - msg = f"OPeNDAP file not read. Verify that the service is available: '{dap_file_address}'" - logger.error(msg) - raise OSError(msg) - - local_file = _get( - name=name, - github_url=github_url, - branch=branch, - cache_dir=cache_dir, - ) - - try: - ds = _open_dataset(local_file, **kwargs) - if not cache: - ds = ds.load() - local_file.unlink() - return ds - except OSError as err: - raise err - - def list_input_variables( submodules: Sequence[str] | None = None, realms: Sequence[str] | None = None ) -> dict: From e298a49aca578f886251434313e487978495fed1 Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:48:26 -0400 Subject: [PATCH 04/21] temporarily use the `cleanup branch` --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index aa6a87364..7f096d85a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,7 +19,7 @@ on: - submitted env: - XCLIM_TESTDATA_BRANCH: v2023.12.14 + XCLIM_TESTDATA_BRANCH: cleanup concurrency: # For a given workflow, if we push to the same branch, cancel all previous builds on that branch except on main. From c50d1d7b1db8cc5d4780906295465f7ed817123d Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 21 Aug 2024 14:43:58 -0400 Subject: [PATCH 05/21] adjustments --- xclim/testing/helpers.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index 81d2ce31a..ca86de2ee 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -199,7 +199,6 @@ def nimbus( # noqa: PR01 ---------- data_dir : str or Path Path to the directory where the data files are stored. - repo : str URL of the repository to use when fetching testing datasets. branch : str @@ -287,6 +286,8 @@ def open_dataset( raise local_file = Path(cache_dir).joinpath(name) + if not local_file.exists(): + raise OSError(f"File not found: {local_file}") try: ds = _open_dataset(local_file, **kwargs) return ds @@ -384,19 +385,15 @@ def gather_testing_data( threadsafe_data_dir: str | os.PathLike[str] | Path, worker_id: str ): """Gather testing data across workers.""" - if ( - not default_cache_dir.joinpath(default_testdata_version).exists() - or PREFETCH_TESTING_DATA - ): - if PREFETCH_TESTING_DATA: - print("`XCLIM_PREFETCH_TESTING_DATA` set. 
Prefetching testing data...") + if worker_id == "master": + populate_testing_data(branch=TESTDATA_BRANCH) + else: if platform == "win32": - raise OSError( - "UNIX-style file-locking is not supported on Windows. " - "Consider running `$ xclim prefetch_testing_data` to download testing data." - ) - elif worker_id == "master": - populate_testing_data(branch=TESTDATA_BRANCH) + if not default_cache_dir.joinpath(default_testdata_version).exists(): + raise FileNotFoundError( + "Testing data not found and UNIX-style file-locking is not supported on Windows. " + "Consider running `$ xclim prefetch_testing_data` to download testing data beforehand." + ) else: default_cache_dir.mkdir(exist_ok=True, parents=True) lockfile = default_cache_dir.joinpath(".lock") @@ -408,7 +405,9 @@ def gather_testing_data( with test_data_being_written.acquire(): if lockfile.exists(): lockfile.unlink() - copytree(default_cache_dir.joinpath(default_testdata_version), threadsafe_data_dir) + copytree( + default_cache_dir.joinpath(default_testdata_version), threadsafe_data_dir + ) def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]: From ea891f4b25bedef53e145a2dc81d5c6efe84c657 Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 21 Aug 2024 15:25:48 -0400 Subject: [PATCH 06/21] simplify last failing tests --- tests/conftest.py | 4 ++-- tests/test_ensembles.py | 10 ++-------- tests/test_testing_utils.py | 10 ++++++++-- xclim/testing/helpers.py | 25 +++---------------------- 4 files changed, 15 insertions(+), 34 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e98fb784f..b6e266b5b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -305,9 +305,9 @@ def threadsafe_data_dir(tmp_path_factory): @pytest.fixture(scope="session") -def nimbus(threadsafe_data_dir): +def nimbus(threadsafe_data_dir, worker_id): return _nimbus( - data_dir=threadsafe_data_dir, + data_dir=default_cache_dir if worker_id == "master" else threadsafe_data_dir, repo=helpers.TESTDATA_REPO_URL, branch=helpers.TESTDATA_BRANCH, ) diff --git a/tests/test_ensembles.py b/tests/test_ensembles.py index 79033b441..896340634 100644 --- a/tests/test_ensembles.py +++ b/tests/test_ensembles.py @@ -26,7 +26,6 @@ from xclim import ensembles from xclim.indices.stats import get_dist -from xclim.testing.helpers import TESTDATA_BRANCH # sklearn's KMeans doesn't accept the standard numpy Generator, so we create a special fixture for these tests @@ -38,9 +37,7 @@ def random_state(): class TestEnsembleStats: - def test_create_ensemble( - self, open_dataset, ensemble_dataset_objects, threadsafe_data_dir - ): + def test_create_ensemble(self, open_dataset, ensemble_dataset_objects, nimbus): ds_all = [] for n in ensemble_dataset_objects["nc_files_simple"]: ds = open_dataset(n, decode_times=False) @@ -62,10 +59,7 @@ def test_create_ensemble( ens1 = ensembles.create_ensemble(ds_all, realizations=reals) # Kinda a hack? Alternative is to open and rewrite in a temp folder. 
- files = [ - threadsafe_data_dir / TESTDATA_BRANCH / "EnsembleStats" / Path(f).name - for f in ensemble_dataset_objects["nc_files_simple"] - ] + files = [nimbus.fetch(f) for f in ensemble_dataset_objects["nc_files_simple"]] ens2 = ensembles.create_ensemble(dict(zip(reals, files))) xr.testing.assert_identical(ens1, ens2) diff --git a/tests/test_testing_utils.py b/tests/test_testing_utils.py index 35646f12e..730a52ea3 100644 --- a/tests/test_testing_utils.py +++ b/tests/test_testing_utils.py @@ -40,9 +40,15 @@ def file_md5_checksum(f_name): return hash_md5.hexdigest() @pytest.mark.requires_internet - def test_open_testdata(self): + def test_open_testdata( + self, + ): + from xclim.testing.helpers import default_cache_dir, default_testdata_version + ds = helpers.open_dataset( - Path("cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712"), engine="h5netcdf" + Path("cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712.nc"), + cache_dir=default_cache_dir.joinpath(default_testdata_version), + engine="h5netcdf", ) assert ds.lon.size == 128 diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index ca86de2ee..3e2a46462 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -85,27 +85,6 @@ $ env XCLIM_TESTDATA_BRANCH="my_testing_branch" pytest """ -PREFETCH_TESTING_DATA = bool(os.getenv("XCLIM_PREFETCH_TESTING_DATA")) -"""Indicates whether the testing data should be downloaded when running tests. - -Notes ------ -When running tests multiple times, this flag allows developers to significantly speed up the pytest suite -by preventing sha256sum checks for all downloaded files. Proceed with caution. - -This can be set for both `pytest` and `tox` by exporting the variable: - -.. code-block:: console - - $ export XCLIM_PREFETCH_TESTING_DATA=1 - -or setting the variable at runtime: - -.. code-block:: console - - $ env XCLIM_PREFETCH_TESTING_DATA=1 pytest -""" - CACHE_DIR = os.getenv("XCLIM_DATA_DIR", default_cache_dir) """Sets the directory to store the testing datasets. 
@@ -128,11 +107,13 @@ __all__ = [ - "PREFETCH_TESTING_DATA", + "CACHE_DIR", "TESTDATA_BRANCH", + "TESTDATA_REPO_URL", "add_example_file_paths", "assert_lazy", "default_cache_dir", + "default_testdata_version", "generate_atmos", "nimbus", "open_dataset", From b667160dec30043a45b917d3a207d8596bcc1a9c Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:55:44 -0400 Subject: [PATCH 07/21] documentation fixes, cleanup, ensure that overrides are being passed properly --- tests/conftest.py | 26 +++-------- xclim/testing/helpers.py | 94 ++++++++++++++++++++++++++++++---------- 2 files changed, 78 insertions(+), 42 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index b6e266b5b..ef002a1a8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,11 +24,6 @@ def random() -> np.random.Generator: return np.random.default_rng(seed=list(map(ord, "𝕽𝔞𝖓𝔡𝖔𝔪"))) -@pytest.fixture -def tmp_netcdf_filename(tmpdir): - return Path(tmpdir).joinpath("testfile.nc") - - @pytest.fixture def lat_series(): def _lat_series(values): @@ -367,27 +362,20 @@ def atmosds(nimbus) -> xr.Dataset: @pytest.fixture(scope="session") def ensemble_dataset_objects() -> dict[str, str]: - edo = dict() - edo["nc_files_simple"] = [ - "EnsembleStats/BCCAQv2+ANUSPLIN300_ACCESS1-0_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_BNU-ESM_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", - "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r2i1p1_1950-2100_tg_mean_YS.nc", - ] - edo["nc_files_extra"] = [ - "EnsembleStats/BCCAQv2+ANUSPLIN300_CNRM-CM5_historical+rcp45_r1i1p1_1970-2050_tg_mean_YS.nc" - ] - edo["nc_files"] = edo["nc_files_simple"] + edo["nc_files_extra"] - return edo + return helpers.add_ensemble_dataset_objects() @pytest.fixture(autouse=True, scope="session") def gather_session_data(request, nimbus, worker_id): """Gather testing data on pytest run. - When running pytest with multiple workers, one worker will copy data remotely to _default_cache_dir while + When running pytest with multiple workers, one worker will copy data remotely to default cache dir while other workers wait using lockfile. Once the lock is released, all workers will then copy data to their local - threadsafe_data_dir.As this fixture is scoped to the session, it will only run once per pytest run. + threadsafe_data_dir. As this fixture is scoped to the session, it will only run once per pytest run. + + Due to the lack of UNIX sockets on Windows, the lockfile mechanism is not supported, requiring users on + Windows to run `$ xclim prefetch_testing_data` before running any tests for the first time to populate the + default cache dir. Additionally, this fixture is also used to generate the `atmosds` synthetic testing dataset. 
""" diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index 3e2a46462..5b09015eb 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -12,12 +12,12 @@ from pathlib import Path from shutil import copytree from sys import platform +from typing import Any from urllib.error import HTTPError, URLError from urllib.parse import urljoin, urlparse import numpy as np import pandas as pd -import pooch import xarray as xr from dask.callbacks import Callback from filelock import FileLock @@ -30,6 +30,15 @@ except ImportError: SocketBlockedError = None +try: + import pooch +except ImportError: + warnings.warn( + "The `pooch` library is not installed. " + "The default cache directory for testing data will not be set." + ) + pooch = None + import xclim from xclim import __version__ as __xclim_version__ from xclim.core import calendar @@ -44,8 +53,11 @@ default_testdata_version = "v2023.12.14" """Default version of the testing data to use when fetching datasets.""" -default_cache_dir = Path(pooch.os_cache("xclim-testdata")) -"""Default location for the testing data cache.""" +try: + default_cache_dir = Path(pooch.os_cache("xclim-testdata")) + """Default location for the testing data cache.""" +except AttributeError: + default_cache_dir = None TESTDATA_REPO_URL = str( os.getenv("XCLIM_TESTDATA_REPO_URL", "https://github.com/Ouranosinc/xclim-testdata") @@ -67,7 +79,7 @@ $ env XCLIM_TESTDATA_REPO_URL="https://github.com/my_username/xclim-testdata" pytest """ -TESTDATA_BRANCH = str(os.getenv("XCLIM_TESTDATA_BRANCH", "main")) +TESTDATA_BRANCH = str(os.getenv("XCLIM_TESTDATA_BRANCH", default_testdata_version)) """Sets the branch of the testing data repository to use when fetching datasets. Notes @@ -124,7 +136,10 @@ def testing_setup_warnings(): """Warn users about potential incompatibilities between xclim and xclim-testdata versions.""" - if re.match(r"^\d+\.\d+\.\d+$", __xclim_version__) and TESTDATA_BRANCH != "main": + if ( + re.match(r"^\d+\.\d+\.\d+$", __xclim_version__) + and TESTDATA_BRANCH != default_testdata_version + ): # This does not need to be emitted on GitHub Workflows and ReadTheDocs if not os.getenv("CI") and not os.getenv("READTHEDOCS"): warnings.warn( @@ -145,7 +160,8 @@ def testing_setup_warnings(): if Version(TESTDATA_BRANCH) > Version(install_calendar_version): warnings.warn( - f"The installation date of `xclim` ({install_date.ctime()}) predates the last release of testing data ({TESTDATA_BRANCH}). " + f"The installation date of `xclim` ({install_date.ctime()}) " + f"predates the last release of testing data ({TESTDATA_BRANCH}). " "It is very likely that the testing data is incompatible with this build of `xclim`.", ) @@ -173,7 +189,7 @@ def nimbus( # noqa: PR01 repo: str = TESTDATA_REPO_URL, branch: str = TESTDATA_BRANCH, data_updates: bool = True, -) -> pooch.Pooch: +): """Pooch registry instance for xclim test data. Parameters @@ -190,7 +206,7 @@ def nimbus( # noqa: PR01 Returns ------- pooch.Pooch - Pooch instance for the xclim test data. + The Pooch instance for accessing the xclim testing data. Notes ----- @@ -215,6 +231,12 @@ def nimbus( # noqa: PR01 example_file = nimbus().fetch("example.nc") data = xr.open_dataset(example_file) """ + if pooch is None: + raise ImportError( + "The `pooch` package is required to fetch the xclim testing data. " + "You can install it with `pip install pooch` or `pip install xclim[dev]`." 
+ ) + remote = f"{repo}/raw/{branch}/data" return pooch.create( path=data_dir, @@ -230,7 +252,7 @@ def nimbus( # noqa: PR01 def open_dataset( name: str | os.PathLike[str], dap_url: str | None = None, - cache_dir: str | os.PathLike[str] = default_cache_dir, + cache_dir: str | os.PathLike[str] | None = CACHE_DIR, **kwargs, ) -> Dataset: r"""Open a dataset from the online GitHub-like repository. @@ -256,6 +278,12 @@ def open_dataset( -------- xarray.open_dataset """ + if cache_dir is None: + raise ValueError( + "The cache directory must be set. " + "Please set the `cache_dir` parameter or the `XCLIM_DATA_DIR` environment variable." + ) + if dap_url: try: return _open_dataset( @@ -280,7 +308,7 @@ def populate_testing_data( temp_folder: Path | None = None, repo: str = TESTDATA_REPO_URL, branch: str = TESTDATA_BRANCH, - local_cache: Path = default_cache_dir, + local_cache: Path = CACHE_DIR, ) -> None: """Populate the local cache with the testing data. @@ -291,7 +319,7 @@ def populate_testing_data( repo : str, optional URL of the repository to use when fetching testing datasets. branch : str, optional - Branch of Ouranosinc/xclim-testdata to use when fetching testing datasets. + Branch of xclim-testdata to use when fetching testing datasets. local_cache : Path The path to the local cache. Defaults to the location set by the platformdirs library. The testing data will be downloaded to this local cache. @@ -363,32 +391,55 @@ def generate_atmos(cache_dir: str | os.PathLike[str] | Path) -> dict[str, xr.Dat def gather_testing_data( - threadsafe_data_dir: str | os.PathLike[str] | Path, worker_id: str + threadsafe_data_dir: str | os.PathLike[str] | Path, + worker_id: str, + cache_dir: str | os.PathLike[str] | None = CACHE_DIR, ): """Gather testing data across workers.""" + if cache_dir is None: + raise ValueError( + "The cache directory must be set. " + "Please set the `cache_dir` parameter or the `XCLIM_DATA_DIR` environment variable." + ) + cache_dir = Path(cache_dir) + if worker_id == "master": populate_testing_data(branch=TESTDATA_BRANCH) else: if platform == "win32": - if not default_cache_dir.joinpath(default_testdata_version).exists(): + if not cache_dir.joinpath(default_testdata_version).exists(): raise FileNotFoundError( "Testing data not found and UNIX-style file-locking is not supported on Windows. " "Consider running `$ xclim prefetch_testing_data` to download testing data beforehand." 
) else: - default_cache_dir.mkdir(exist_ok=True, parents=True) - lockfile = default_cache_dir.joinpath(".lock") + cache_dir.mkdir(exist_ok=True, parents=True) + lockfile = cache_dir.joinpath(".lock") test_data_being_written = FileLock(lockfile) with test_data_being_written: # This flag prevents multiple calls from re-attempting to download testing data in the same pytest run populate_testing_data(branch=TESTDATA_BRANCH) - default_cache_dir.joinpath(".data_written").touch() + cache_dir.joinpath(".data_written").touch() with test_data_being_written.acquire(): if lockfile.exists(): lockfile.unlink() - copytree( - default_cache_dir.joinpath(default_testdata_version), threadsafe_data_dir - ) + copytree(cache_dir.joinpath(default_testdata_version), threadsafe_data_dir) + + +def add_ensemble_dataset_objects() -> dict[str, str]: + namespace = { + "nc_files_simple": [ + "EnsembleStats/BCCAQv2+ANUSPLIN300_ACCESS1-0_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", + "EnsembleStats/BCCAQv2+ANUSPLIN300_BNU-ESM_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", + "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", + "EnsembleStats/BCCAQv2+ANUSPLIN300_CCSM4_historical+rcp45_r2i1p1_1950-2100_tg_mean_YS.nc", + ], + "nc_files_extra": [ + "EnsembleStats/BCCAQv2+ANUSPLIN300_CNRM-CM5_historical+rcp45_r1i1p1_1970-2050_tg_mean_YS.nc" + ], + } + namespace["nc_files"] = namespace["nc_files_simple"] + namespace["nc_files_extra"] + return namespace def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]: @@ -409,7 +460,6 @@ def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]: } # For core.utils.load_module example - sixty_years = xr.cftime_range("1990-01-01", "2049-12-31", freq="D") namespace["temperature_datasets"] = [ xr.DataArray( @@ -435,11 +485,10 @@ def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]: }, ), ] - return namespace -def add_doctest_filepaths(): +def add_doctest_filepaths() -> dict[str, Any]: """Add filepaths to the xdoctest namespace.""" namespace: dict = dict() namespace["np"] = np @@ -448,7 +497,6 @@ def add_doctest_filepaths(): np.random.rand(365) * 20 + 253.15, variable="tas" ) namespace["pr"] = test_timeseries(np.random.rand(365) * 5, variable="pr") - return namespace From 0ba3277c0bf32f9df0429393f00acefddc6d9288 Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:56:16 -0400 Subject: [PATCH 08/21] dependency updates --- environment.yml | 7 +++---- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/environment.yml b/environment.yml index 070f86943..9a7c4ea35 100644 --- a/environment.yml +++ b/environment.yml @@ -28,9 +28,9 @@ dependencies: - flox - lmoments3 # Required for some Jupyter notebooks # Testing and development dependencies - - black ==24.4.2 + - black ==24.8.0 - blackdoc ==0.3.9 - - bump-my-version >=0.24.3 + - bump-my-version >=0.25.4 - cairosvg - codespell ==2.3.0 - coverage >=7.5.0 @@ -54,7 +54,6 @@ dependencies: - nc-time-axis >=1.4.1 - notebook - pandas-stubs >=2.2 - - platformdirs >=3.2 - pooch >=1.8.0 - pre-commit >=3.7 - pybtex >=0.24.0 @@ -74,7 +73,7 @@ dependencies: - tokenize-rt >=5.2.0 - tox >=4.16.0 # - tox-conda # Will be added when a tox@v4.0+ compatible plugin is released. - - vulture # ==2.11 # The conda-forge version is out of date. 
+  - vulture ==2.11
   - xdoctest >=1.1.5
   - yamllint >=1.35.1
   - pip >=24.0
diff --git a/pyproject.toml b/pyproject.toml
index 628356fb5..d9ed30256 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,7 +48,6 @@ dependencies = [
   "packaging >=24.0",
   "pandas >=2.2",
   "pint >=0.18",
-  "pooch >=1.8.0",
   "pyarrow >=15.0.0", # Strongly encouraged for pandas v2.2.0+
   "pyyaml >=6.0.1",
   "scikit-learn >=0.21.3",
@@ -79,6 +78,7 @@ dev = [
   "nbval >=0.11.0",
   "pandas-stubs >=2.2",
   "pip >=24.0",
+  "pooch >=1.8.0",
   "pre-commit >=3.7",
   "pylint >=3.2.4",
   "pytest >=8.0.0",

From ceb5d0f1b7fc8754868e1543f19eb857325cb6b2 Mon Sep 17 00:00:00 2001
From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com>
Date: Thu, 22 Aug 2024 13:17:22 -0400
Subject: [PATCH 09/21] update CHANGELOG.rst

---
 CHANGELOG.rst | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index b5930a36f..de439ac1b 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -2,14 +2,29 @@ Changelog
 =========
 
-v0.53.0
+v0.53.0 (unreleased)
 --------------------
-Contributors to this version: Adrien Lamarche (:user:`LamAdr`).
+Contributors to this version: Adrien Lamarche (:user:`LamAdr`), Trevor James Smith (:user:`Zeitsperre`).
 
 Bug fixes
 ^^^^^^^^^
 * Fixed a small inefficiency in ``_otc_adjust`` (:pull:`1890`).
 
+Breaking changes
+^^^^^^^^^^^^^^^^
+* The ``xclim.testing`` module has been refactored to make use of `pooch`, and many now-redundant functions have been removed (:pull:`1889`):
+    * ``xclim.testing.utils.open_dataset`` is now found under ``xclim.testing.helpers.open_dataset`` and uses a `pooch` instance to deliver locally-stored datasets. Its call signature has also changed.
+    * ``xclim.testing.utils.get_file``, ``xclim.testing.utils.get_local_testdata``, ``xclim.testing.utils.list_datasets``, and ``xclim.testing.utils.file_md5_checksum`` have been removed.
+    * ``xclim.testing.helpers.nimbus`` replaces much of this functionality. See the `xclim` documentation for more information.
+* The `Ouranosinc/xclim-testdata` repository has been restructured for better organization and to make better use of `pooch` and data registries for testing data fetching. (:pull:`1889`).
+* `platformdirs` is no longer a direct dependency of `xclim`, but `pooch` is required to use many of the new testing functions (installable via `pip install pooch` or `pip install 'xclim[dev]'`). (:pull:`1889`).
+
+Internal changes
+^^^^^^^^^^^^^^^^
+* The testing data fetching mechanism has been completely rewritten to use `pooch` and file registries. (:pull:`1889`).
+* Many tests focused on evaluating the normal operation of remote file access tools under ``xclim.testing`` have been removed. (:pull:`1889`).
+* Setup and teardown functions that were found under ``tests/conftest.py`` have been optimized to reduce redundant calls when running ``pytest xclim``. Some obsolete `pytest` fixtures have also been removed. (:pull:`1889`).
+
 v0.52.0 (2024-08-08)
 --------------------
 Contributors to this version: David Huard (:user:`huard`), Trevor James Smith (:user:`Zeitsperre`), Hui-Min Wang (:user:`Hem-W`), Éric Dupuis (:user:`coxipi`), Sarah Gammon (:user:`SarahG-579462`), Pascal Bourgault (:user:`aulemahal`), Juliette Lavoie (:user:`juliettelavoie`), Adrien Lamarche (:user:`LamAdr`).
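
The ``nimbus`` entry point summarized in the changelog above is exercised in the ``Examples`` section of its docstring. A minimal sketch of that workflow, assuming a build of `xclim` with this patch series applied, `pooch` and `h5netcdf` installed, and network access to the testing data repository (the ERA5 file name below is one of the registry entries used elsewhere in this series):

.. code-block:: python

    import xarray as xr

    from xclim.testing.helpers import nimbus

    # Build the registry-backed pooch fetcher; defaults are read from the
    # XCLIM_TESTDATA_* environment variables documented in helpers.py.
    n = nimbus()

    # fetch() downloads the file on first use, verifies it against the
    # registry hash, and returns the cached local path on later calls.
    local_file = n.fetch("ERA5/daily_surface_cancities_1990-1993.nc")
    ds = xr.open_dataset(local_file, engine="h5netcdf")
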
From d014e495f75ccaf8a79c9dbf23215b5241c0d979 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:52:12 -0400 Subject: [PATCH 10/21] update prefetch cli and documentation, refactoring for consistency --- CONTRIBUTING.rst | 17 +++++++++-------- tests/conftest.py | 8 +++++--- tests/test_testing_utils.py | 7 +++++-- xclim/cli.py | 38 +++++++++++++++++++++++++++++++------ xclim/testing/helpers.py | 25 ++++++++++++------------ 5 files changed, 64 insertions(+), 31 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 1ecc47a54..df93ee2f3 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -269,9 +269,10 @@ Updating Testing Data If your code changes require changes to the testing data of `xclim` (i.e.: modifications to existing datasets or new datasets), these changes must be made via a Pull Request at the `xclim-testdata repository`_. -`xclim` allows for developers to test specific branches/versions of `xclim-testdata` via the `XCLIM_TESTDATA_BRANCH` environment variable, either through export, e.g.:: +`xclim` allows for developers to test specific branches/versions or forks of the `xclim-testdata` repository via the `XCLIM_TESTDATA_BRANCH` and `XCLIM_TESTDATA_REPO` environment variables, respectively, either through export, e.g.:: $ export XCLIM_TESTDATA_BRANCH="my_new_branch_of_testing_data" + $ export XCLIM_TESTDATA_REPO="https://github.com/my_username/xclim-testdata" $ pytest # or, alternatively: @@ -279,11 +280,11 @@ If your code changes require changes to the testing data of `xclim` (i.e.: modif or by setting the variable at runtime:: - $ env XCLIM_TESTDATA_BRANCH="my_new_branch_of_testing_data" pytest + $ env XCLIM_TESTDATA_BRANCH="my_new_branch_of_testing_data" XCLIM_TESTDATA_REPO="https://github.com/my_username/xclim-testdata" pytest # or, alternatively: - $ env XCLIM_TESTDATA_BRANCH="my_new_branch_of_testing_data" tox + $ env XCLIM_TESTDATA_BRANCH="my_new_branch_of_testing_data" XCLIM_TESTDATA_REPO="https://github.com/my_username/xclim-testdata" tox -This will ensure that tests load the testing data from this branch before running. +This will ensure that tests load the appropriate testing data from this branch or repository before running. If you anticipate not having internet access, we suggest prefetching the testing data from `xclim-testdata repository`_ and storing it in your local cache. This can be done by running the following console command:: @@ -296,7 +297,7 @@ If your development branch relies on a specific branch of `Ouranosinc/xclim-test or, alternatively, with the `--branch` option:: - $ xclim prefetch_testing_data --branch my_new_branch_of_testing_data + $ xclim prefetch_testing_data --branch my_new_branch_of_testing_data --repo "https://github.com/my_username/xclim-testdata" If you wish to test a specific branch using GitHub CI, this can be set in `.github/workflows/main.yml`: @@ -306,7 +307,7 @@ If you wish to test a specific branch using GitHub CI, this can be set in `.gith XCLIM_TESTDATA_BRANCH: my_new_branch_of_testing_data .. warning:: - In order for a Pull Request to be allowed to merge to main development branch, this variable must match the latest tagged commit name on `xclim-testdata repository`_. + In order for a Pull Request to be allowed to merge to the `main` development branch, this variable must match the latest tagged commit name on `xclim-testdata repository`_. 
We suggest merging changed testing data first, tagging a new version of `xclim-testdata`, then re-running tests on your Pull Request at `Ouranosinc/xclim` with the newest tag. Running Tests in Offline Mode @@ -323,8 +324,8 @@ or, alternatively, using `tox` :: $ tox -e offline -These options will disable all network calls and skip tests marked with the `requires_internet` marker. -The `--allow-unix-socket` option is required to allow the `pytest-xdist`_ plugin to function properly. +These options will disable all network calls and skip tests marked with the ``requires_internet`` marker. +The ``--allow-unix-socket`` option is required to allow the `pytest-xdist`_ plugin to function properly. Tips ---- diff --git a/tests/conftest.py b/tests/conftest.py index ef002a1a8..ed7f1861d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,7 +13,7 @@ from xclim.core import indicator from xclim.core.calendar import max_doy from xclim.testing import helpers -from xclim.testing.helpers import default_cache_dir # noqa +from xclim.testing.helpers import default_testdata_cache # noqa from xclim.testing.helpers import nimbus as _nimbus from xclim.testing.helpers import open_dataset as _open_dataset from xclim.testing.helpers import test_timeseries @@ -302,7 +302,9 @@ def threadsafe_data_dir(tmp_path_factory): @pytest.fixture(scope="session") def nimbus(threadsafe_data_dir, worker_id): return _nimbus( - data_dir=default_cache_dir if worker_id == "master" else threadsafe_data_dir, + data_dir=( + default_testdata_cache if worker_id == "master" else threadsafe_data_dir + ), repo=helpers.TESTDATA_REPO_URL, branch=helpers.TESTDATA_BRANCH, ) @@ -385,7 +387,7 @@ def gather_session_data(request, nimbus, worker_id): def remove_data_written_flag(): """Cleanup cache folder once we are finished.""" - flag = default_cache_dir.joinpath(".data_written") + flag = default_testdata_cache.joinpath(".data_written") if flag.exists(): flag.unlink() diff --git a/tests/test_testing_utils.py b/tests/test_testing_utils.py index 730a52ea3..02585a69e 100644 --- a/tests/test_testing_utils.py +++ b/tests/test_testing_utils.py @@ -43,11 +43,14 @@ def file_md5_checksum(f_name): def test_open_testdata( self, ): - from xclim.testing.helpers import default_cache_dir, default_testdata_version + from xclim.testing.helpers import ( + default_testdata_cache, + default_testdata_version, + ) ds = helpers.open_dataset( Path("cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712.nc"), - cache_dir=default_cache_dir.joinpath(default_testdata_version), + cache_dir=default_testdata_cache.joinpath(default_testdata_version), engine="h5netcdf", ) assert ds.lon.size == 128 diff --git a/xclim/cli.py b/xclim/cli.py index 1df887b0d..f90a8c95a 100644 --- a/xclim/cli.py +++ b/xclim/cli.py @@ -18,7 +18,11 @@ from xclim.core.utils import InputKind from xclim.testing.helpers import ( TESTDATA_BRANCH, - default_cache_dir, + TESTDATA_CACHE, + TESTDATA_REPO_URL, + default_testdata_cache, + default_testdata_repo_url, + default_testdata_version, populate_testing_data, ) from xclim.testing.utils import publish_release_notes, show_versions @@ -155,25 +159,47 @@ def show_version_info(ctx): @click.command(short_help="Prefetch xclim testing data for development purposes.") +@click.option( + "-r", + "--repo", + help="The xclim-testdata repo to be fetched and cached. If not specified, defaults to " + f"`XCLIM_TESTDATA_REPO_URL` (if set) or `{default_testdata_repo_url}`.", +) @click.option( "-b", "--branch", help="The xclim-testdata branch to be fetched and cached. 
If not specified, defaults to "
-    "`XCLIM_TESTING_DATA_BRANCH` (if set) or `main`.",
+    f"`XCLIM_TESTDATA_BRANCH` (if set) or `{default_testdata_version}`.",
+)
+@click.option(
+    "-c",
+    "--cache-dir",
+    help="The local cache directory where fetched testing data will be stored. If not specified, defaults to "
+    f"`XCLIM_TESTDATA_CACHE` (if set) or `{default_testdata_cache}`.",
 )
 @click.pass_context
-def prefetch_testing_data(ctx, branch):
+def prefetch_testing_data(ctx, repo, branch, cache_dir):
     """Prefetch xclim testing data for development purposes."""
+    if repo:
+        testdata_repo = repo
+    else:
+        testdata_repo = TESTDATA_REPO_URL
     if branch:
         testdata_branch = branch
     else:
         testdata_branch = TESTDATA_BRANCH
+    if cache_dir:
+        testdata_cache = cache_dir
+    else:
+        testdata_cache = TESTDATA_CACHE
+    click.echo(f"Gathering testing data from {testdata_repo}/{testdata_branch} ...")
     click.echo(
-        f"Gathering testing data from xclim-testdata `{testdata_branch}` branch..."
+        populate_testing_data(
+            repo=testdata_repo, branch=testdata_branch, local_cache=testdata_cache
+        )
     )
-    click.echo(populate_testing_data(branch=testdata_branch))
-    click.echo(f"Testing data saved to `{default_cache_dir}`.")
+    click.echo(f"Testing data saved to `{testdata_cache}`.")
     ctx.exit()

diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py
index 5b09015eb..0fb6840fe 100644
--- a/xclim/testing/helpers.py
+++ b/xclim/testing/helpers.py
@@ -52,16 +52,16 @@

 default_testdata_version = "v2023.12.14"
 """Default version of the testing data to use when fetching datasets."""
+default_testdata_repo_url = "https://github.com/Ouranosinc/xclim-testdata"
+"""Default URL of the testing data repository to use when fetching datasets."""

 try:
-    default_cache_dir = Path(pooch.os_cache("xclim-testdata"))
+    default_testdata_cache = Path(pooch.os_cache("xclim-testdata"))
     """Default location for the testing data cache."""
 except AttributeError:
-    default_cache_dir = None
+    default_testdata_cache = None

-TESTDATA_REPO_URL = str(
-    os.getenv("XCLIM_TESTDATA_REPO_URL", "https://github.com/Ouranosinc/xclim-testdata")
-)
+TESTDATA_REPO_URL = str(os.getenv("XCLIM_TESTDATA_REPO_URL", default_testdata_repo_url))
 """Sets the URL of the testing data repository to use when fetching datasets.

 Notes
 -----
@@ -97,7 +97,7 @@
     $ env XCLIM_TESTDATA_BRANCH="my_testing_branch" pytest
 """

-CACHE_DIR = os.getenv("XCLIM_DATA_DIR", default_cache_dir)
+TESTDATA_CACHE = os.getenv("XCLIM_TESTDATA_CACHE", default_testdata_cache)
 """Sets the directory to store the testing datasets.

 If not set, the default location will be used (based on ``platformdirs``, see :func:`pooch.os_cache`).
@@ -119,12 +119,13 @@


 __all__ = [
-    "CACHE_DIR",
     "TESTDATA_BRANCH",
+    "TESTDATA_CACHE",
     "TESTDATA_REPO_URL",
     "add_example_file_paths",
     "assert_lazy",
-    "default_cache_dir",
+    "default_testdata_cache",
+    "default_testdata_repo_url",
     "default_testdata_version",
     "generate_atmos",
     "nimbus",
@@ -185,7 +186,7 @@ def load_registry() -> dict[str, str]:


 def nimbus(  # noqa: PR01
-    data_dir: str | Path = CACHE_DIR,
+    data_dir: str | Path = TESTDATA_CACHE,
     repo: str = TESTDATA_REPO_URL,
     branch: str = TESTDATA_BRANCH,
     data_updates: bool = True,
@@ -252,7 +253,7 @@ def nimbus(  # noqa: PR01

 def open_dataset(
     name: str | os.PathLike[str],
     dap_url: str | None = None,
-    cache_dir: str | os.PathLike[str] | None = CACHE_DIR,
+    cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE,
     **kwargs,
 ) -> Dataset:
     r"""Open a dataset from the online GitHub-like repository.
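For reference, the reworked command accepts all three overrides at once; a hypothetical invocation (any flag left unset falls back to the corresponding `XCLIM_TESTDATA_*` environment variable, then to the package default) might look like:

.. code-block:: console

    $ xclim prefetch_testing_data \
        --repo "https://github.com/my_username/xclim-testdata" \
        --branch my_new_branch_of_testing_data \
        --cache-dir "/path/to/my/data"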
@@ -308,7 +309,7 @@ def populate_testing_data( temp_folder: Path | None = None, repo: str = TESTDATA_REPO_URL, branch: str = TESTDATA_BRANCH, - local_cache: Path = CACHE_DIR, + local_cache: Path = TESTDATA_CACHE, ) -> None: """Populate the local cache with the testing data. @@ -393,7 +394,7 @@ def generate_atmos(cache_dir: str | os.PathLike[str] | Path) -> dict[str, xr.Dat def gather_testing_data( threadsafe_data_dir: str | os.PathLike[str] | Path, worker_id: str, - cache_dir: str | os.PathLike[str] | None = CACHE_DIR, + cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE, ): """Gather testing data across workers.""" if cache_dir is None: From 49a48bc128110f0269d7c527265945b781364072 Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Fri, 23 Aug 2024 10:58:03 -0400 Subject: [PATCH 11/21] audit url and allow for downloading custom registry files --- xclim/testing/helpers.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index 0fb6840fe..8dd0127fc 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -15,6 +15,7 @@ from typing import Any from urllib.error import HTTPError, URLError from urllib.parse import urljoin, urlparse +from urllib.request import urlretrieve import numpy as np import pandas as pd @@ -167,7 +168,9 @@ def testing_setup_warnings(): ) -def load_registry() -> dict[str, str]: +def load_registry( + branch: str = TESTDATA_BRANCH, repo: str = TESTDATA_REPO_URL +) -> dict[str, str]: """Load the registry file for the test data. Returns @@ -175,6 +178,20 @@ def load_registry() -> dict[str, str]: dict Dictionary of filenames and hashes. """ + remote_registry = audit_url(f"{repo}/raw/{branch}/data/registry.txt") + + if branch is not default_testdata_version: + custom_registry_folder = Path( + str(ilr.files("xclim").joinpath(f"testing/{branch}")) + ) + custom_registry_folder.mkdir(parents=True, exist_ok=True) + registry_file = custom_registry_folder.joinpath("registry.txt") + urlretrieve(remote_registry, registry_file) # noqa: S310 + + elif repo is not default_testdata_repo_url: + registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt"))) + urlretrieve(remote_registry, registry_file) # noqa: S310 + registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt"))) if not registry_file.exists(): raise FileNotFoundError(f"Registry file not found: {registry_file}") @@ -238,14 +255,14 @@ def nimbus( # noqa: PR01 "You can install it with `pip install pooch` or `pip install xclim[dev]`." 
) - remote = f"{repo}/raw/{branch}/data" + remote = audit_url(f"{repo}/raw/{branch}/data") return pooch.create( path=data_dir, base_url=remote, version=default_testdata_version, version_dev=branch, allow_updates=data_updates, - registry=load_registry(), + registry=load_registry(branch=branch, repo=repo), ) From d2cfd10fb6ec28ed0a1f0058380b8cb6aceaa47c Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Fri, 23 Aug 2024 11:09:25 -0400 Subject: [PATCH 12/21] update the testdata tag, add some missing docstrings --- .github/workflows/main.yml | 2 +- xclim/testing/helpers.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7f096d85a..83e4c3099 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,7 +19,7 @@ on: - submitted env: - XCLIM_TESTDATA_BRANCH: cleanup + XCLIM_TESTDATA_BRANCH: v2024.8.23 concurrency: # For a given workflow, if we push to the same branch, cancel all previous builds on that branch except on main. diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index 8dd0127fc..5f1805272 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -51,8 +51,9 @@ logger = logging.getLogger("xclim") -default_testdata_version = "v2023.12.14" +default_testdata_version = "v2024.8.23" """Default version of the testing data to use when fetching datasets.""" + default_testdata_repo_url = "https://github.com/Ouranosinc/xclim-testdata" """Default URL of the testing data repository to use when fetching datasets.""" @@ -109,13 +110,13 @@ .. code-block:: console - $ export XCLIM_DATA_DIR="/path/to/my/data" + $ export XCLIM_TESTDATA_CACHE="/path/to/my/data" or setting the variable at runtime: .. 
code-block:: console - $ env XCLIM_DATA_DIR="/path/to/my/data" pytest + $ env XCLIM_TESTDATA_CACHE="/path/to/my/data" pytest """ @@ -123,6 +124,8 @@ "TESTDATA_BRANCH", "TESTDATA_CACHE", "TESTDATA_REPO_URL", + "add_doctest_filepaths", + "add_ensemble_dataset_objects", "add_example_file_paths", "assert_lazy", "default_testdata_cache", @@ -445,6 +448,7 @@ def gather_testing_data( def add_ensemble_dataset_objects() -> dict[str, str]: + """Create a dictionary of xclim ensemble-related datasets to be patched into the xdoctest namespace.""" namespace = { "nc_files_simple": [ "EnsembleStats/BCCAQv2+ANUSPLIN300_ACCESS1-0_historical+rcp45_r1i1p1_1950-2100_tg_mean_YS.nc", @@ -461,7 +465,7 @@ def add_ensemble_dataset_objects() -> dict[str, str]: def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]: - """Create a dictionary of relevant datasets to be patched into the xdoctest namespace.""" + """Create a dictionary of doctest-relevant datasets to be patched into the xdoctest namespace.""" namespace = { "path_to_ensemble_file": "EnsembleReduce/TestEnsReduceCriteria.nc", "path_to_pr_file": "NRCANdaily/nrcan_canada_daily_pr_1990.nc", @@ -507,7 +511,7 @@ def add_example_file_paths() -> dict[str, str | list[xr.DataArray]]: def add_doctest_filepaths() -> dict[str, Any]: - """Add filepaths to the xdoctest namespace.""" + """Overload some libraries directly into the xdoctest namespace.""" namespace: dict = dict() namespace["np"] = np namespace["xclim"] = xclim From 42b6cee49f42c037f769cbfdbda3ccd05b18b02b Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Fri, 23 Aug 2024 13:22:04 -0400 Subject: [PATCH 13/21] update notebooks and add some remote access logic to xclim.testing.helpers.open_dataset --- docs/notebooks/analogs.ipynb | 6 +++--- docs/notebooks/extendxclim.ipynb | 4 ++-- docs/notebooks/sdba-advanced.ipynb | 9 +++++---- docs/notebooks/sdba.ipynb | 4 ++-- docs/notebooks/usage.ipynb | 11 +++-------- xclim/testing/helpers.py | 31 +++++++++++++++++++++++------- 6 files changed, 39 insertions(+), 26 deletions(-) diff --git a/docs/notebooks/analogs.ipynb b/docs/notebooks/analogs.ipynb index c60f6f28e..b8cc9612c 100644 --- a/docs/notebooks/analogs.ipynb +++ b/docs/notebooks/analogs.ipynb @@ -24,10 +24,10 @@ "from __future__ import annotations\n", "\n", "import matplotlib.pyplot as plt\n", + "from xarray.coding.calendar_ops import convert_calendar\n", "\n", "from xclim import analog\n", - "from xclim.core.calendar import convert_calendar\n", - "from xclim.testing import open_dataset" + "from xclim.testing.helpers import open_dataset" ] }, { @@ -258,7 +258,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/docs/notebooks/extendxclim.ipynb b/docs/notebooks/extendxclim.ipynb index aa2b679c9..a07a130ae 100644 --- a/docs/notebooks/extendxclim.ipynb +++ b/docs/notebooks/extendxclim.ipynb @@ -466,7 +466,7 @@ "metadata": {}, "outputs": [], "source": [ - "from xclim.testing import open_dataset\n", + "from xclim.testing.helpers import open_dataset\n", "\n", "ds = open_dataset(\"ERA5/daily_surface_cancities_1990-1993.nc\")\n", "with xr.set_options(keep_attrs=True):\n", @@ -599,7 +599,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.12.5" }, "toc": { "base_numbering": 1, diff --git a/docs/notebooks/sdba-advanced.ipynb b/docs/notebooks/sdba-advanced.ipynb index 
97456b6bd..b7d2c4210 100644 --- a/docs/notebooks/sdba-advanced.ipynb +++ b/docs/notebooks/sdba-advanced.ipynb @@ -429,10 +429,11 @@ "metadata": {}, "outputs": [], "source": [ + "from xarray.coding.calendar_ops import convert_calendar\n", + "\n", "import xclim.sdba as sdba\n", - "from xclim.core.calendar import convert_calendar\n", "from xclim.core.units import convert_units_to\n", - "from xclim.testing import open_dataset\n", + "from xclim.testing.helpers import open_dataset\n", "\n", "group = sdba.Grouper(\"time.dayofyear\", window=31)\n", "\n", @@ -753,7 +754,7 @@ "\n", "import xclim as xc\n", "from xclim import sdba\n", - "from xclim.testing import open_dataset\n", + "from xclim.testing.helpers import open_dataset\n", "\n", "# load test data\n", "hist = open_dataset(\"sdba/CanESM2_1950-2100.nc\").sel(time=slice(\"1950\", \"1980\")).tasmax\n", @@ -880,7 +881,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.12.5" }, "toc": { "base_numbering": 1, diff --git a/docs/notebooks/sdba.ipynb b/docs/notebooks/sdba.ipynb index 345d5c446..45f4a53a3 100644 --- a/docs/notebooks/sdba.ipynb +++ b/docs/notebooks/sdba.ipynb @@ -457,7 +457,7 @@ "import numpy as np\n", "\n", "from xclim.core.units import convert_units_to\n", - "from xclim.testing import open_dataset\n", + "from xclim.testing.helpers import open_dataset\n", "\n", "dref = open_dataset(\"sdba/ahccd_1950-2013.nc\", drop_variables=[\"lat\", \"lon\"]).sel(\n", " time=slice(\"1981\", \"2010\")\n", @@ -808,7 +808,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.12.5" }, "toc": { "base_numbering": 1, diff --git a/docs/notebooks/usage.ipynb b/docs/notebooks/usage.ipynb index 90aad6b6b..15a535afb 100644 --- a/docs/notebooks/usage.ipynb +++ b/docs/notebooks/usage.ipynb @@ -26,7 +26,7 @@ "import xarray as xr\n", "\n", "import xclim.indices\n", - "from xclim import testing" + "from xclim.testing.helpers import open_dataset" ] }, { @@ -48,7 +48,7 @@ "# ds = xr.open_dataset(\"your_file.nc\")\n", "\n", "# For this example, let's use a test dataset from xclim:\n", - "ds = testing.open_dataset(\"ERA5/daily_surface_cancities_1990-1993.nc\")\n", + "ds = open_dataset(\"ERA5/daily_surface_cancities_1990-1993.nc\")\n", "ds.tas" ] }, @@ -164,11 +164,6 @@ "Resampling to a daily frequency and running the same indicator succeeds, but we will still get warnings from the CF metadata checks." ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -387,7 +382,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.7" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index 5f1805272..8c0c5a460 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -273,6 +273,8 @@ def nimbus( # noqa: PR01 def open_dataset( name: str | os.PathLike[str], dap_url: str | None = None, + branch: str = TESTDATA_BRANCH, + repo: str = TESTDATA_REPO_URL, cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE, **kwargs, ) -> Dataset: @@ -286,6 +288,10 @@ def open_dataset( Name of the file containing the dataset. dap_url : str, optional URL to OPeNDAP folder where the data is stored. If supplied, supersedes github_url. + branch : str + Branch of the repository to use when fetching datasets. + repo: str + URL of the repository to use when fetching testing datasets. 
cache_dir : Path The directory in which to search for and write cached data. \*\*kwargs @@ -295,6 +301,11 @@ def open_dataset( ------- Union[Dataset, Path] + Raises + ------ + OSError + If the file is not found in the cache directory or cannot be read. + See Also -------- xarray.open_dataset @@ -310,19 +321,25 @@ def open_dataset( return _open_dataset( audit_url(urljoin(dap_url, str(name)), context="OPeNDAP"), **kwargs ) - except (OSError, URLError): - msg = f"OPeNDAP file not read. Verify that the service is available: '{urljoin(dap_url, str(name))}'" - logger.error(msg) + except URLError: raise + except OSError as e: + msg = f"OPeNDAP file not read. Verify that the service is available: '{urljoin(dap_url, str(name))}'" + raise OSError(msg) from e - local_file = Path(cache_dir).joinpath(name) + local_file = Path(cache_dir).joinpath(branch).joinpath(name) if not local_file.exists(): - raise OSError(f"File not found: {local_file}") + try: + local_file = nimbus(branch=branch, repo=repo).fetch(name) + except OSError as e: + raise OSError( + f"File not found locally. Verify that the testing data is available in remote: {local_file}" + ) from e try: ds = _open_dataset(local_file, **kwargs) return ds - except OSError as err: - raise err + except OSError: + raise def populate_testing_data( From 5d07cc6822404f4f9cb908238fec8ed46029456c Mon Sep 17 00:00:00 2001 From: Trevor James Smith <10819524+Zeitsperre@users.noreply.github.com> Date: Fri, 23 Aug 2024 14:32:20 -0400 Subject: [PATCH 14/21] adjustments --- tests/conftest.py | 4 ++-- xclim/testing/helpers.py | 26 ++++++++++++++++---------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index ed7f1861d..78cbc4a04 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -382,8 +382,8 @@ def gather_session_data(request, nimbus, worker_id): Additionally, this fixture is also used to generate the `atmosds` synthetic testing dataset. """ helpers.testing_setup_warnings() - helpers.gather_testing_data(nimbus.path, worker_id) - helpers.generate_atmos(nimbus.path) + helpers.gather_testing_data(worker_cache_dir=nimbus.path, worker_id=worker_id) + helpers.generate_atmos(cache_dir=nimbus.path) def remove_data_written_flag(): """Cleanup cache folder once we are finished.""" diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index 8c0c5a460..7e7b92443 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -183,7 +183,7 @@ def load_registry( """ remote_registry = audit_url(f"{repo}/raw/{branch}/data/registry.txt") - if branch is not default_testdata_version: + if branch != default_testdata_version: custom_registry_folder = Path( str(ilr.files("xclim").joinpath(f"testing/{branch}")) ) @@ -191,7 +191,7 @@ def load_registry( registry_file = custom_registry_folder.joinpath("registry.txt") urlretrieve(remote_registry, registry_file) # noqa: S310 - elif repo is not default_testdata_repo_url: + elif repo != default_testdata_repo_url: registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt"))) urlretrieve(remote_registry, registry_file) # noqa: S310 @@ -327,7 +327,7 @@ def open_dataset( msg = f"OPeNDAP file not read. 
Verify that the service is available: '{urljoin(dap_url, str(name))}'" raise OSError(msg) from e - local_file = Path(cache_dir).joinpath(branch).joinpath(name) + local_file = Path(cache_dir).joinpath(name) if not local_file.exists(): try: local_file = nimbus(branch=branch, repo=repo).fetch(name) @@ -395,10 +395,14 @@ def populate_testing_data( ) -def generate_atmos(cache_dir: str | os.PathLike[str] | Path) -> dict[str, xr.DataArray]: +def generate_atmos( + branch: str | os.PathLike[str] | Path = TESTDATA_BRANCH, + cache_dir: str | os.PathLike[str] | Path = TESTDATA_CACHE, +) -> dict[str, xr.DataArray]: """Create the `atmosds` synthetic testing dataset.""" with open_dataset( "ERA5/daily_surface_cancities_1990-1993.nc", + branch=branch, cache_dir=cache_dir, engine="h5netcdf", ) as ds: @@ -423,23 +427,25 @@ def generate_atmos(cache_dir: str | os.PathLike[str] | Path) -> dict[str, xr.Dat ds.to_netcdf(atmos_file, engine="h5netcdf") # Give access to dataset variables by name in namespace - with open_dataset(atmos_file, cache_dir=cache_dir, engine="h5netcdf") as ds: + with open_dataset( + atmos_file, branch=branch, cache_dir=cache_dir, engine="h5netcdf" + ) as ds: namespace = {f"{var}_dataset": ds[var] for var in ds.data_vars} return namespace def gather_testing_data( - threadsafe_data_dir: str | os.PathLike[str] | Path, + worker_cache_dir: str | os.PathLike[str] | Path, worker_id: str, - cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE, + _cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE, ): """Gather testing data across workers.""" - if cache_dir is None: + if _cache_dir is None: raise ValueError( "The cache directory must be set. " "Please set the `cache_dir` parameter or the `XCLIM_DATA_DIR` environment variable." ) - cache_dir = Path(cache_dir) + cache_dir = Path(_cache_dir) if worker_id == "master": populate_testing_data(branch=TESTDATA_BRANCH) @@ -461,7 +467,7 @@ def gather_testing_data( with test_data_being_written.acquire(): if lockfile.exists(): lockfile.unlink() - copytree(cache_dir.joinpath(default_testdata_version), threadsafe_data_dir) + copytree(cache_dir.joinpath(default_testdata_version), worker_cache_dir) def add_ensemble_dataset_objects() -> dict[str, str]: From fe97e300b2de1c81bb5ce79694dfcf686bd338c3 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 28 Aug 2024 12:58:08 -0400 Subject: [PATCH 15/21] fix doctests --- tests/conftest.py | 14 ++++++++++---- xclim/cli.py | 10 +++++----- xclim/testing/conftest.py | 39 ++++++++++++++++++++++++++------------- xclim/testing/helpers.py | 36 +++++++++++++++++++----------------- 4 files changed, 60 insertions(+), 39 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 78cbc4a04..de698e215 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -302,11 +302,11 @@ def threadsafe_data_dir(tmp_path_factory): @pytest.fixture(scope="session") def nimbus(threadsafe_data_dir, worker_id): return _nimbus( - data_dir=( - default_testdata_cache if worker_id == "master" else threadsafe_data_dir - ), repo=helpers.TESTDATA_REPO_URL, branch=helpers.TESTDATA_BRANCH, + cache_dir=( + helpers.TESTDATA_CACHE_DIR if worker_id == "master" else threadsafe_data_dir + ), ) @@ -315,7 +315,13 @@ def open_dataset(nimbus): def _open_session_scoped_file(file: str | os.PathLike, **xr_kwargs): xr_kwargs.setdefault("cache", True) xr_kwargs.setdefault("engine", "h5netcdf") - return _open_dataset(file, cache_dir=nimbus.path, **xr_kwargs) + return _open_dataset( + file, + 
branch=helpers.TESTDATA_BRANCH, + repo=helpers.TESTDATA_REPO_URL, + cache_dir=nimbus.path, + **xr_kwargs, + ) return _open_session_scoped_file diff --git a/xclim/cli.py b/xclim/cli.py index f90a8c95a..d0cb2c96f 100644 --- a/xclim/cli.py +++ b/xclim/cli.py @@ -18,7 +18,7 @@ from xclim.core.utils import InputKind from xclim.testing.helpers import ( TESTDATA_BRANCH, - TESTDATA_CACHE, + TESTDATA_CACHE_DIR, TESTDATA_REPO_URL, default_testdata_cache, default_testdata_repo_url, @@ -189,17 +189,17 @@ def prefetch_testing_data(ctx, repo, branch, cache_dir): else: testdata_branch = TESTDATA_BRANCH if cache_dir: - testdata_cache = cache_dir + testdata_cache_dir = cache_dir else: - testdata_cache = TESTDATA_CACHE + testdata_cache_dir = TESTDATA_CACHE_DIR click.echo(f"Gathering testing data from {testdata_repo}/{testdata_branch} ...") click.echo( populate_testing_data( - repo=testdata_repo, branch=testdata_branch, local_cache=testdata_cache + repo=testdata_repo, branch=testdata_branch, local_cache=testdata_cache_dir ) ) - click.echo(f"Testing data saved to `{testdata_cache}`.") + click.echo(f"Testing data saved to `{testdata_cache_dir}`.") ctx.exit() diff --git a/xclim/testing/conftest.py b/xclim/testing/conftest.py index 7e175e975..337efbbc6 100644 --- a/xclim/testing/conftest.py +++ b/xclim/testing/conftest.py @@ -21,21 +21,36 @@ def threadsafe_data_dir(tmp_path_factory): @pytest.fixture(scope="session") -def open_dataset(threadsafe_data_dir): - """Return a function that opens a dataset from the test data directory.""" +def nimbus(threadsafe_data_dir, worker_id): + """Return a nimbus object for the test data.""" + return helpers.nimbus( + repo=helpers.TESTDATA_REPO_URL, + branch=helpers.TESTDATA_BRANCH, + cache_dir=( + helpers.TESTDATA_CACHE_DIR if worker_id == "master" else threadsafe_data_dir + ), + ) - def _open_session_scoped_file( - file: str | os.PathLike, branch: str = helpers.TESTDATA_BRANCH, **xr_kwargs - ): + +@pytest.fixture(scope="session") +def open_dataset(nimbus): + """Return a function that opens a dataset from the test data.""" + + def _open_session_scoped_file(file: str | os.PathLike, **xr_kwargs): + xr_kwargs.setdefault("cache", True) xr_kwargs.setdefault("engine", "h5netcdf") return _open_dataset( - file, cache_dir=threadsafe_data_dir, branch=branch, **xr_kwargs + file, + branch=helpers.TESTDATA_BRANCH, + repo=helpers.TESTDATA_REPO_URL, + cache_dir=nimbus.path, + **xr_kwargs, ) return _open_session_scoped_file -@pytest.fixture(autouse=True, scope="session") +@pytest.fixture(scope="session", autouse=True) def is_matplotlib_installed(xdoctest_namespace) -> None: """Skip tests that require matplotlib if it is not installed.""" @@ -50,14 +65,12 @@ def _is_matplotlib_installed(): xdoctest_namespace["is_matplotlib_installed"] = _is_matplotlib_installed -@pytest.fixture(autouse=True, scope="session") -def doctest_setup( - xdoctest_namespace, threadsafe_data_dir, worker_id, open_dataset -) -> None: +@pytest.fixture(scope="session", autouse=True) +def doctest_setup(xdoctest_namespace, nimbus, worker_id, open_dataset) -> None: """Gather testing data on doctest run.""" helpers.testing_setup_warnings() - helpers.gather_testing_data(threadsafe_data_dir, worker_id) - xdoctest_namespace.update(helpers.generate_atmos(threadsafe_data_dir)) + helpers.gather_testing_data(worker_cache_dir=nimbus.path, worker_id=worker_id) + xdoctest_namespace.update(helpers.generate_atmos(cache_dir=nimbus.path)) class AttrDict(dict): def __init__(self, *args, **kwargs): diff --git a/xclim/testing/helpers.py 
b/xclim/testing/helpers.py index 7e7b92443..caf94374e 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -99,7 +99,7 @@ $ env XCLIM_TESTDATA_BRANCH="my_testing_branch" pytest """ -TESTDATA_CACHE = os.getenv("XCLIM_TESTDATA_CACHE", default_testdata_cache) +TESTDATA_CACHE_DIR = os.getenv("XCLIM_TESTDATA_CACHE_DIR", default_testdata_cache) """Sets the directory to store the testing datasets. If not set, the default location will be used (based on ``platformdirs``, see :func:`pooch.os_cache`). @@ -110,19 +110,19 @@ .. code-block:: console - $ export XCLIM_TESTDATA_CACHE="/path/to/my/data" + $ export XCLIM_TESTDATA_CACHE_DIR="/path/to/my/data" or setting the variable at runtime: .. code-block:: console - $ env XCLIM_TESTDATA_CACHE="/path/to/my/data" pytest + $ env XCLIM_TESTDATA_CACHE_DIR="/path/to/my/data" pytest """ __all__ = [ "TESTDATA_BRANCH", - "TESTDATA_CACHE", + "TESTDATA_CACHE_DIR", "TESTDATA_REPO_URL", "add_doctest_filepaths", "add_ensemble_dataset_objects", @@ -206,21 +206,21 @@ def load_registry( def nimbus( # noqa: PR01 - data_dir: str | Path = TESTDATA_CACHE, repo: str = TESTDATA_REPO_URL, branch: str = TESTDATA_BRANCH, + cache_dir: str | Path = TESTDATA_CACHE_DIR, data_updates: bool = True, ): """Pooch registry instance for xclim test data. Parameters ---------- - data_dir : str or Path - Path to the directory where the data files are stored. repo : str URL of the repository to use when fetching testing datasets. branch : str Branch of repository to use when fetching testing datasets. + cache_dir : str or Path + The path to the directory where the data files are stored. data_updates : bool If True, allow updates to the data files. Default is True. @@ -232,9 +232,9 @@ def nimbus( # noqa: PR01 Notes ----- There are three environment variables that can be used to control the behaviour of this registry: - - ``XCLIM_DATA_DIR``: If this environment variable is set, it will be used as the base directory to store the data - files. The directory should be an absolute path (i.e., it should start with ``/``). Otherwise, - the default location will be used (based on ``platformdirs``, see :py:func:`pooch.os_cache`). + - ``XCLIM_TESTDATA_CACHE_DIR``: If this environment variable is set, it will be used as the base directory to + store the data files. The directory should be an absolute path (i.e., it should start with ``/``). + Otherwise,the default location will be used (based on ``platformdirs``, see :py:func:`pooch.os_cache`). - ``XCLIM_TESTDATA_REPO_URL``: If this environment variable is set, it will be used as the URL of the repository to use when fetching datasets. Otherwise, the default repository will be used. - ``XCLIM_TESTDATA_BRANCH``: If this environment variable is set, it will be used as the branch of the repository @@ -260,7 +260,7 @@ def nimbus( # noqa: PR01 remote = audit_url(f"{repo}/raw/{branch}/data") return pooch.create( - path=data_dir, + path=cache_dir, base_url=remote, version=default_testdata_version, version_dev=branch, @@ -275,7 +275,7 @@ def open_dataset( dap_url: str | None = None, branch: str = TESTDATA_BRANCH, repo: str = TESTDATA_REPO_URL, - cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE, + cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE_DIR, **kwargs, ) -> Dataset: r"""Open a dataset from the online GitHub-like repository. 
@@ -330,7 +330,9 @@ def open_dataset( local_file = Path(cache_dir).joinpath(name) if not local_file.exists(): try: - local_file = nimbus(branch=branch, repo=repo).fetch(name) + local_file = nimbus(branch=branch, repo=repo, cache_dir=cache_dir).fetch( + name + ) except OSError as e: raise OSError( f"File not found locally. Verify that the testing data is available in remote: {local_file}" @@ -346,7 +348,7 @@ def populate_testing_data( temp_folder: Path | None = None, repo: str = TESTDATA_REPO_URL, branch: str = TESTDATA_BRANCH, - local_cache: Path = TESTDATA_CACHE, + local_cache: Path = TESTDATA_CACHE_DIR, ) -> None: """Populate the local cache with the testing data. @@ -367,7 +369,7 @@ def populate_testing_data( None """ # Create the Pooch instance - n = nimbus(data_dir=temp_folder or local_cache, repo=repo, branch=branch) + n = nimbus(repo=repo, branch=branch, cache_dir=temp_folder or local_cache) # Download the files errored_files = [] @@ -397,7 +399,7 @@ def populate_testing_data( def generate_atmos( branch: str | os.PathLike[str] | Path = TESTDATA_BRANCH, - cache_dir: str | os.PathLike[str] | Path = TESTDATA_CACHE, + cache_dir: str | os.PathLike[str] | Path = TESTDATA_CACHE_DIR, ) -> dict[str, xr.DataArray]: """Create the `atmosds` synthetic testing dataset.""" with open_dataset( @@ -437,7 +439,7 @@ def generate_atmos( def gather_testing_data( worker_cache_dir: str | os.PathLike[str] | Path, worker_id: str, - _cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE, + _cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE_DIR, ): """Gather testing data across workers.""" if _cache_dir is None: From 51f34ab192da0d2eadd9754f5218824d0416fdab Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 28 Aug 2024 13:59:13 -0400 Subject: [PATCH 16/21] refactor API to reduce changes for users --- tests/conftest.py | 38 ++- tests/test_testing_utils.py | 20 +- xclim/cli.py | 5 +- xclim/testing/conftest.py | 40 ++- xclim/testing/helpers.py | 425 +----------------------------- xclim/testing/utils.py | 502 +++++++++++++++++++++++++++++++++--- 6 files changed, 533 insertions(+), 497 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index de698e215..dc2b25fc6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,11 +12,21 @@ from xclim.core import indicator from xclim.core.calendar import max_doy -from xclim.testing import helpers -from xclim.testing.helpers import default_testdata_cache # noqa -from xclim.testing.helpers import nimbus as _nimbus -from xclim.testing.helpers import open_dataset as _open_dataset -from xclim.testing.helpers import test_timeseries +from xclim.testing.helpers import ( + add_ensemble_dataset_objects, + generate_atmos, + test_timeseries, +) +from xclim.testing.utils import ( + TESTDATA_BRANCH, + TESTDATA_CACHE_DIR, + TESTDATA_REPO_URL, + default_testdata_cache, + gather_testing_data, +) +from xclim.testing.utils import nimbus as _nimbus +from xclim.testing.utils import open_dataset as _open_dataset +from xclim.testing.utils import testing_setup_warnings @pytest.fixture @@ -302,10 +312,10 @@ def threadsafe_data_dir(tmp_path_factory): @pytest.fixture(scope="session") def nimbus(threadsafe_data_dir, worker_id): return _nimbus( - repo=helpers.TESTDATA_REPO_URL, - branch=helpers.TESTDATA_BRANCH, + repo=TESTDATA_REPO_URL, + branch=TESTDATA_BRANCH, cache_dir=( - helpers.TESTDATA_CACHE_DIR if worker_id == "master" else threadsafe_data_dir + TESTDATA_CACHE_DIR if worker_id == "master" else threadsafe_data_dir ), ) 
@@ -317,8 +327,8 @@ def _open_session_scoped_file(file: str | os.PathLike, **xr_kwargs): xr_kwargs.setdefault("engine", "h5netcdf") return _open_dataset( file, - branch=helpers.TESTDATA_BRANCH, - repo=helpers.TESTDATA_REPO_URL, + branch=TESTDATA_BRANCH, + repo=TESTDATA_REPO_URL, cache_dir=nimbus.path, **xr_kwargs, ) @@ -370,7 +380,7 @@ def atmosds(nimbus) -> xr.Dataset: @pytest.fixture(scope="session") def ensemble_dataset_objects() -> dict[str, str]: - return helpers.add_ensemble_dataset_objects() + return add_ensemble_dataset_objects() @pytest.fixture(autouse=True, scope="session") @@ -387,9 +397,9 @@ def gather_session_data(request, nimbus, worker_id): Additionally, this fixture is also used to generate the `atmosds` synthetic testing dataset. """ - helpers.testing_setup_warnings() - helpers.gather_testing_data(worker_cache_dir=nimbus.path, worker_id=worker_id) - helpers.generate_atmos(cache_dir=nimbus.path) + testing_setup_warnings() + gather_testing_data(worker_cache_dir=nimbus.path, worker_id=worker_id) + generate_atmos(branch=TESTDATA_BRANCH, cache_dir=nimbus.path) def remove_data_written_flag(): """Cleanup cache folder once we are finished.""" diff --git a/tests/test_testing_utils.py b/tests/test_testing_utils.py index 02585a69e..0253b7881 100644 --- a/tests/test_testing_utils.py +++ b/tests/test_testing_utils.py @@ -1,7 +1,6 @@ from __future__ import annotations import platform -import sys from pathlib import Path import numpy as np @@ -9,9 +8,9 @@ from xarray import Dataset from xclim import __version__ as __xclim_version__ -from xclim.testing import helpers -from xclim.testing import utils as utilities from xclim.testing.helpers import test_timeseries as timeseries +from xclim.testing.utils import open_dataset as testing_open_dataset +from xclim.testing.utils import publish_release_notes, show_versions class TestFixtures: @@ -43,12 +42,9 @@ def file_md5_checksum(f_name): def test_open_testdata( self, ): - from xclim.testing.helpers import ( - default_testdata_cache, - default_testdata_version, - ) + from xclim.testing.utils import default_testdata_cache, default_testdata_version - ds = helpers.open_dataset( + ds = testing_open_dataset( Path("cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712.nc"), cache_dir=default_testdata_cache.joinpath(default_testdata_version), engine="h5netcdf", @@ -59,7 +55,7 @@ def test_md5_sum(self): test_data = Path(__file__).parent / "data" callendar = test_data / "callendar_1938.txt" md5_sum = self.file_md5_checksum(callendar) - if sys.platform == "win32": + if platform.system() == "Windows": # Windows has a different line ending (CR-LF) than Unix (LF) assert md5_sum == "38083271c2d4c85dea6bd6baf23d34de" # noqa else: @@ -69,7 +65,7 @@ def test_md5_sum(self): class TestReleaseSupportFuncs: def test_show_version_file(self, tmp_path): temp_filename = tmp_path.joinpath("version_info.txt") - utilities.show_versions(file=temp_filename) + show_versions(file=temp_filename) with open(temp_filename) as f: contents = f.readlines().copy() @@ -82,7 +78,7 @@ def test_show_version_file(self, tmp_path): @pytest.mark.requires_docs def test_release_notes_file(self, tmp_path): temp_filename = tmp_path.joinpath("version_info.txt") - utilities.publish_release_notes(style="md", file=temp_filename) + publish_release_notes(style="md", file=temp_filename) with open(temp_filename) as f: assert "# Changelog" in f.readlines()[0] @@ -91,4 +87,4 @@ def test_release_notes_file(self, tmp_path): def test_release_notes_file_not_implemented(self, tmp_path): temp_filename = 
tmp_path.joinpath("version_info.txt") with pytest.raises(NotImplementedError): - utilities.publish_release_notes(style="qq", file=temp_filename) + publish_release_notes(style="qq", file=temp_filename) diff --git a/xclim/cli.py b/xclim/cli.py index d0cb2c96f..a7fcc1174 100644 --- a/xclim/cli.py +++ b/xclim/cli.py @@ -16,7 +16,7 @@ import xclim as xc from xclim.core.dataflags import DataQualityException, data_flags, ecad_compliant from xclim.core.utils import InputKind -from xclim.testing.helpers import ( +from xclim.testing.utils import ( TESTDATA_BRANCH, TESTDATA_CACHE_DIR, TESTDATA_REPO_URL, @@ -24,8 +24,9 @@ default_testdata_repo_url, default_testdata_version, populate_testing_data, + publish_release_notes, + show_versions, ) -from xclim.testing.utils import publish_release_notes, show_versions distributed = False try: diff --git a/xclim/testing/conftest.py b/xclim/testing/conftest.py index 337efbbc6..095a22efd 100644 --- a/xclim/testing/conftest.py +++ b/xclim/testing/conftest.py @@ -10,8 +10,20 @@ import pytest -from xclim.testing import helpers -from xclim.testing.helpers import open_dataset as _open_dataset +from xclim.testing.helpers import ( + add_doctest_filepaths, + add_example_file_paths, + generate_atmos, +) +from xclim.testing.utils import ( + TESTDATA_BRANCH, + TESTDATA_CACHE_DIR, + TESTDATA_REPO_URL, + gather_testing_data, +) +from xclim.testing.utils import nimbus as _nimbus +from xclim.testing.utils import open_dataset as _open_dataset +from xclim.testing.utils import testing_setup_warnings @pytest.fixture(autouse=True, scope="session") @@ -23,11 +35,11 @@ def threadsafe_data_dir(tmp_path_factory): @pytest.fixture(scope="session") def nimbus(threadsafe_data_dir, worker_id): """Return a nimbus object for the test data.""" - return helpers.nimbus( - repo=helpers.TESTDATA_REPO_URL, - branch=helpers.TESTDATA_BRANCH, + return _nimbus( + repo=TESTDATA_REPO_URL, + branch=TESTDATA_BRANCH, cache_dir=( - helpers.TESTDATA_CACHE_DIR if worker_id == "master" else threadsafe_data_dir + TESTDATA_CACHE_DIR if worker_id == "master" else threadsafe_data_dir ), ) @@ -41,8 +53,8 @@ def _open_session_scoped_file(file: str | os.PathLike, **xr_kwargs): xr_kwargs.setdefault("engine", "h5netcdf") return _open_dataset( file, - branch=helpers.TESTDATA_BRANCH, - repo=helpers.TESTDATA_REPO_URL, + branch=TESTDATA_BRANCH, + repo=TESTDATA_REPO_URL, cache_dir=nimbus.path, **xr_kwargs, ) @@ -68,9 +80,11 @@ def _is_matplotlib_installed(): @pytest.fixture(scope="session", autouse=True) def doctest_setup(xdoctest_namespace, nimbus, worker_id, open_dataset) -> None: """Gather testing data on doctest run.""" - helpers.testing_setup_warnings() - helpers.gather_testing_data(worker_cache_dir=nimbus.path, worker_id=worker_id) - xdoctest_namespace.update(helpers.generate_atmos(cache_dir=nimbus.path)) + testing_setup_warnings() + gather_testing_data(worker_cache_dir=nimbus.path, worker_id=worker_id) + xdoctest_namespace.update( + generate_atmos(branch=TESTDATA_BRANCH, cache_dir=nimbus.path) + ) class AttrDict(dict): def __init__(self, *args, **kwargs): @@ -80,5 +94,5 @@ def __init__(self, *args, **kwargs): xdoctest_namespace["open_dataset"] = open_dataset xdoctest_namespace["xr"] = AttrDict() xdoctest_namespace["xr"].update({"open_dataset": open_dataset}) - xdoctest_namespace.update(helpers.add_doctest_filepaths()) - xdoctest_namespace.update(helpers.add_example_file_paths()) + xdoctest_namespace.update(add_doctest_filepaths()) + xdoctest_namespace.update(add_example_file_paths()) diff --git 
a/xclim/testing/helpers.py b/xclim/testing/helpers.py index caf94374e..e29af99ff 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -2,404 +2,42 @@ from __future__ import annotations -import importlib.resources as ilr import logging import os -import re -import time import warnings -from datetime import datetime as dt from pathlib import Path -from shutil import copytree -from sys import platform from typing import Any -from urllib.error import HTTPError, URLError -from urllib.parse import urljoin, urlparse -from urllib.request import urlretrieve import numpy as np import pandas as pd import xarray as xr from dask.callbacks import Callback -from filelock import FileLock -from packaging.version import Version -from xarray import Dataset -from xarray import open_dataset as _open_dataset - -try: - from pytest_socket import SocketBlockedError -except ImportError: - SocketBlockedError = None - -try: - import pooch -except ImportError: - warnings.warn( - "The `pooch` library is not installed. " - "The default cache directory for testing data will not be set." - ) - pooch = None import xclim -from xclim import __version__ as __xclim_version__ from xclim.core import calendar from xclim.core.utils import VARIABLES from xclim.indices import ( longwave_upwelling_radiation_from_net_downwelling, shortwave_upwelling_radiation_from_net_downwelling, ) +from xclim.testing.utils import open_dataset logger = logging.getLogger("xclim") -default_testdata_version = "v2024.8.23" -"""Default version of the testing data to use when fetching datasets.""" - -default_testdata_repo_url = "https://github.com/Ouranosinc/xclim-testdata" -"""Default URL of the testing data repository to use when fetching datasets.""" - -try: - default_testdata_cache = Path(pooch.os_cache("xclim-testdata")) - """Default location for the testing data cache.""" -except AttributeError: - default_testdata_cache = None - -TESTDATA_REPO_URL = str(os.getenv("XCLIM_TESTDATA_REPO_URL", default_testdata_repo_url)) -"""Sets the URL of the testing data repository to use when fetching datasets. - -Notes ------ -When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: - -.. code-block:: console - - $ export XCLIM_TESTDATA_REPO_URL="https://github.com/my_username/xclim-testdata" - -or setting the variable at runtime: - -.. code-block:: console - - $ env XCLIM_TESTDATA_REPO_URL="https://github.com/my_username/xclim-testdata" pytest -""" - -TESTDATA_BRANCH = str(os.getenv("XCLIM_TESTDATA_BRANCH", default_testdata_version)) -"""Sets the branch of the testing data repository to use when fetching datasets. - -Notes ------ -When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: - -.. code-block:: console - - $ export XCLIM_TESTDATA_BRANCH="my_testing_branch" - -or setting the variable at runtime: - -.. code-block:: console - - $ env XCLIM_TESTDATA_BRANCH="my_testing_branch" pytest -""" - -TESTDATA_CACHE_DIR = os.getenv("XCLIM_TESTDATA_CACHE_DIR", default_testdata_cache) -"""Sets the directory to store the testing datasets. - -If not set, the default location will be used (based on ``platformdirs``, see :func:`pooch.os_cache`). - -Notes ------ -When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable: - -.. code-block:: console - - $ export XCLIM_TESTDATA_CACHE_DIR="/path/to/my/data" - -or setting the variable at runtime: - -.. 
code-block:: console - - $ env XCLIM_TESTDATA_CACHE_DIR="/path/to/my/data" pytest -""" - __all__ = [ - "TESTDATA_BRANCH", - "TESTDATA_CACHE_DIR", - "TESTDATA_REPO_URL", "add_doctest_filepaths", "add_ensemble_dataset_objects", "add_example_file_paths", "assert_lazy", - "default_testdata_cache", - "default_testdata_repo_url", - "default_testdata_version", "generate_atmos", - "nimbus", - "open_dataset", - "populate_testing_data", "test_timeseries", ] -def testing_setup_warnings(): - """Warn users about potential incompatibilities between xclim and xclim-testdata versions.""" - if ( - re.match(r"^\d+\.\d+\.\d+$", __xclim_version__) - and TESTDATA_BRANCH != default_testdata_version - ): - # This does not need to be emitted on GitHub Workflows and ReadTheDocs - if not os.getenv("CI") and not os.getenv("READTHEDOCS"): - warnings.warn( - f"`xclim` stable ({__xclim_version__}) is running tests against a non-default branch of the testing data. " - "It is possible that changes to the testing data may be incompatible with some assertions in this version. " - f"Please be sure to check {TESTDATA_REPO_URL} for more information.", - ) - - if re.match(r"^v\d+\.\d+\.\d+", TESTDATA_BRANCH): - # Find the date of last modification of xclim source files to generate a calendar version - install_date = dt.strptime( - time.ctime(os.path.getmtime(xclim.__file__)), - "%a %b %d %H:%M:%S %Y", - ) - install_calendar_version = ( - f"{install_date.year}.{install_date.month}.{install_date.day}" - ) - - if Version(TESTDATA_BRANCH) > Version(install_calendar_version): - warnings.warn( - f"The installation date of `xclim` ({install_date.ctime()}) " - f"predates the last release of testing data ({TESTDATA_BRANCH}). " - "It is very likely that the testing data is incompatible with this build of `xclim`.", - ) - - -def load_registry( - branch: str = TESTDATA_BRANCH, repo: str = TESTDATA_REPO_URL -) -> dict[str, str]: - """Load the registry file for the test data. - - Returns - ------- - dict - Dictionary of filenames and hashes. - """ - remote_registry = audit_url(f"{repo}/raw/{branch}/data/registry.txt") - - if branch != default_testdata_version: - custom_registry_folder = Path( - str(ilr.files("xclim").joinpath(f"testing/{branch}")) - ) - custom_registry_folder.mkdir(parents=True, exist_ok=True) - registry_file = custom_registry_folder.joinpath("registry.txt") - urlretrieve(remote_registry, registry_file) # noqa: S310 - - elif repo != default_testdata_repo_url: - registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt"))) - urlretrieve(remote_registry, registry_file) # noqa: S310 - - registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt"))) - if not registry_file.exists(): - raise FileNotFoundError(f"Registry file not found: {registry_file}") - - # Load the registry file - with registry_file.open() as f: - registry = {line.split()[0]: line.split()[1] for line in f} - return registry - - -def nimbus( # noqa: PR01 - repo: str = TESTDATA_REPO_URL, - branch: str = TESTDATA_BRANCH, - cache_dir: str | Path = TESTDATA_CACHE_DIR, - data_updates: bool = True, -): - """Pooch registry instance for xclim test data. - - Parameters - ---------- - repo : str - URL of the repository to use when fetching testing datasets. - branch : str - Branch of repository to use when fetching testing datasets. - cache_dir : str or Path - The path to the directory where the data files are stored. - data_updates : bool - If True, allow updates to the data files. Default is True. 
- - Returns - ------- - pooch.Pooch - The Pooch instance for accessing the xclim testing data. - - Notes - ----- - There are three environment variables that can be used to control the behaviour of this registry: - - ``XCLIM_TESTDATA_CACHE_DIR``: If this environment variable is set, it will be used as the base directory to - store the data files. The directory should be an absolute path (i.e., it should start with ``/``). - Otherwise,the default location will be used (based on ``platformdirs``, see :py:func:`pooch.os_cache`). - - ``XCLIM_TESTDATA_REPO_URL``: If this environment variable is set, it will be used as the URL of the repository - to use when fetching datasets. Otherwise, the default repository will be used. - - ``XCLIM_TESTDATA_BRANCH``: If this environment variable is set, it will be used as the branch of the repository - to use when fetching datasets. Otherwise, the default branch will be used. - - Examples - -------- - Using the registry to download a file: - - .. code-block:: python - - import xarray as xr - from xclim.testing.helpers import nimbus - - example_file = nimbus().fetch("example.nc") - data = xr.open_dataset(example_file) - """ - if pooch is None: - raise ImportError( - "The `pooch` package is required to fetch the xclim testing data. " - "You can install it with `pip install pooch` or `pip install xclim[dev]`." - ) - - remote = audit_url(f"{repo}/raw/{branch}/data") - return pooch.create( - path=cache_dir, - base_url=remote, - version=default_testdata_version, - version_dev=branch, - allow_updates=data_updates, - registry=load_registry(branch=branch, repo=repo), - ) - - -# idea copied from raven that it borrowed from xclim that borrowed it from xarray that was borrowed from Seaborn -def open_dataset( - name: str | os.PathLike[str], - dap_url: str | None = None, - branch: str = TESTDATA_BRANCH, - repo: str = TESTDATA_REPO_URL, - cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE_DIR, - **kwargs, -) -> Dataset: - r"""Open a dataset from the online GitHub-like repository. - - If a local copy is found then always use that to avoid network traffic. - - Parameters - ---------- - name : str - Name of the file containing the dataset. - dap_url : str, optional - URL to OPeNDAP folder where the data is stored. If supplied, supersedes github_url. - branch : str - Branch of the repository to use when fetching datasets. - repo: str - URL of the repository to use when fetching testing datasets. - cache_dir : Path - The directory in which to search for and write cached data. - \*\*kwargs - For NetCDF files, keywords passed to :py:func:`xarray.open_dataset`. - - Returns - ------- - Union[Dataset, Path] - - Raises - ------ - OSError - If the file is not found in the cache directory or cannot be read. - - See Also - -------- - xarray.open_dataset - """ - if cache_dir is None: - raise ValueError( - "The cache directory must be set. " - "Please set the `cache_dir` parameter or the `XCLIM_DATA_DIR` environment variable." - ) - - if dap_url: - try: - return _open_dataset( - audit_url(urljoin(dap_url, str(name)), context="OPeNDAP"), **kwargs - ) - except URLError: - raise - except OSError as e: - msg = f"OPeNDAP file not read. Verify that the service is available: '{urljoin(dap_url, str(name))}'" - raise OSError(msg) from e - - local_file = Path(cache_dir).joinpath(name) - if not local_file.exists(): - try: - local_file = nimbus(branch=branch, repo=repo, cache_dir=cache_dir).fetch( - name - ) - except OSError as e: - raise OSError( - f"File not found locally. 
Verify that the testing data is available in remote: {local_file}" - ) from e - try: - ds = _open_dataset(local_file, **kwargs) - return ds - except OSError: - raise - - -def populate_testing_data( - temp_folder: Path | None = None, - repo: str = TESTDATA_REPO_URL, - branch: str = TESTDATA_BRANCH, - local_cache: Path = TESTDATA_CACHE_DIR, -) -> None: - """Populate the local cache with the testing data. - - Parameters - ---------- - temp_folder : Path, optional - Path to a temporary folder to use as the local cache. If not provided, the default location will be used. - repo : str, optional - URL of the repository to use when fetching testing datasets. - branch : str, optional - Branch of xclim-testdata to use when fetching testing datasets. - local_cache : Path - The path to the local cache. Defaults to the location set by the platformdirs library. - The testing data will be downloaded to this local cache. - - Returns - ------- - None - """ - # Create the Pooch instance - n = nimbus(repo=repo, branch=branch, cache_dir=temp_folder or local_cache) - - # Download the files - errored_files = [] - for file in load_registry(): - try: - n.fetch(file) - except HTTPError: - msg = f"File `{file}` not accessible in remote repository." - logging.error(msg) - errored_files.append(file) - except SocketBlockedError as e: # noqa - msg = ( - "Unable to access registry file online. Testing suite is being run with `--disable-socket`. " - "If you intend to run tests with this option enabled, please download the file beforehand with the " - "following console command: `$ xclim prefetch_testing_data`." - ) - raise SocketBlockedError(msg) from e - else: - logging.info("Files were downloaded successfully.") - - if errored_files: - logging.error( - "The following files were unable to be downloaded: %s", - errored_files, - ) - - def generate_atmos( - branch: str | os.PathLike[str] | Path = TESTDATA_BRANCH, - cache_dir: str | os.PathLike[str] | Path = TESTDATA_CACHE_DIR, + branch: str | os.PathLike[str] | Path, + cache_dir: str | os.PathLike[str] | Path, ) -> dict[str, xr.DataArray]: """Create the `atmosds` synthetic testing dataset.""" with open_dataset( @@ -436,42 +74,6 @@ def generate_atmos( return namespace -def gather_testing_data( - worker_cache_dir: str | os.PathLike[str] | Path, - worker_id: str, - _cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE_DIR, -): - """Gather testing data across workers.""" - if _cache_dir is None: - raise ValueError( - "The cache directory must be set. " - "Please set the `cache_dir` parameter or the `XCLIM_DATA_DIR` environment variable." - ) - cache_dir = Path(_cache_dir) - - if worker_id == "master": - populate_testing_data(branch=TESTDATA_BRANCH) - else: - if platform == "win32": - if not cache_dir.joinpath(default_testdata_version).exists(): - raise FileNotFoundError( - "Testing data not found and UNIX-style file-locking is not supported on Windows. " - "Consider running `$ xclim prefetch_testing_data` to download testing data beforehand." 
- ) - else: - cache_dir.mkdir(exist_ok=True, parents=True) - lockfile = cache_dir.joinpath(".lock") - test_data_being_written = FileLock(lockfile) - with test_data_being_written: - # This flag prevents multiple calls from re-attempting to download testing data in the same pytest run - populate_testing_data(branch=TESTDATA_BRANCH) - cache_dir.joinpath(".data_written").touch() - with test_data_being_written.acquire(): - if lockfile.exists(): - lockfile.unlink() - copytree(cache_dir.joinpath(default_testdata_version), worker_cache_dir) - - def add_ensemble_dataset_objects() -> dict[str, str]: """Create a dictionary of xclim ensemble-related datasets to be patched into the xdoctest namespace.""" namespace = { @@ -593,24 +195,3 @@ def _raise_on_compute(dsk: dict): assert_lazy = Callback(start=_raise_on_compute) """Context manager that raises an AssertionError if any dask computation is triggered.""" - - -def audit_url(url: str, context: str | None = None) -> str: - """Check if the URL is well-formed. - - Raises - ------ - URLError - If the URL is not well-formed. - """ - msg = "" - result = urlparse(url) - if result.scheme == "http": - msg = f"{context if context else ''} URL is not using secure HTTP: '{url}'".strip() - if not all([result.scheme, result.netloc]): - msg = f"{context if context else ''} URL is not well-formed: '{url}'".strip() - - if msg: - logger.error(msg) - raise URLError(msg) - return url diff --git a/xclim/testing/utils.py b/xclim/testing/utils.py index 6120582f3..13fa86563 100644 --- a/xclim/testing/utils.py +++ b/xclim/testing/utils.py @@ -3,53 +3,140 @@ ====================================== """ -# Some of this code was copied and adapted from xarray from __future__ import annotations +import importlib.resources as ilr import logging import os import platform import re import sys +import time +import warnings from collections.abc import Sequence +from datetime import datetime as dt from importlib import import_module from io import StringIO from pathlib import Path +from shutil import copytree from typing import TextIO +from urllib.error import HTTPError, URLError +from urllib.parse import urljoin, urlparse +from urllib.request import urlretrieve -_xclim_deps = [ - "xclim", - "xarray", - "statsmodels", - "sklearn", - "scipy", - "pint", - "pandas", - "numpy", - "numba", - "lmoments3", - "jsonpickle", - "flox", - "dask", - "cf_xarray", - "cftime", - "clisops", - "click", - "bottleneck", - "boltons", -] +from filelock import FileLock +from packaging.version import Version +from xarray import Dataset +from xarray import open_dataset as _open_dataset + +import xclim +from xclim import __version__ as __xclim_version__ + +try: + from pytest_socket import SocketBlockedError +except ImportError: + SocketBlockedError = None + +try: + import pooch +except ImportError: + warnings.warn( + "The `pooch` library is not installed. " + "The default cache directory for testing data will not be set." 
+    )
+    pooch = None

 logger = logging.getLogger("xclim")

 __all__ = [
+    "TESTDATA_BRANCH",
+    "TESTDATA_CACHE_DIR",
+    "TESTDATA_REPO_URL",
+    "audit_url",
+    "default_testdata_cache",
+    "default_testdata_repo_url",
+    "default_testdata_version",
+    "gather_testing_data",
     "list_input_variables",
+    "nimbus",
+    "open_dataset",
+    "populate_testing_data",
     "publish_release_notes",
     "run_doctests",
     "show_versions",
+    "testing_setup_warnings",
 ]

+default_testdata_version = "v2024.8.23"
+"""Default version of the testing data to use when fetching datasets."""
+
+default_testdata_repo_url = "https://github.com/Ouranosinc/xclim-testdata"
+"""Default URL of the testing data repository to use when fetching datasets."""
+
+try:
+    default_testdata_cache = Path(pooch.os_cache("xclim-testdata"))
+    """Default location for the testing data cache."""
+except AttributeError:
+    default_testdata_cache = None
+
+TESTDATA_REPO_URL = str(os.getenv("XCLIM_TESTDATA_REPO_URL", default_testdata_repo_url))
+"""Sets the URL of the testing data repository to use when fetching datasets.
+
+Notes
+-----
+When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable:
+
+.. code-block:: console
+
+    $ export XCLIM_TESTDATA_REPO_URL="https://github.com/my_username/xclim-testdata"
+
+or setting the variable at runtime:
+
+.. code-block:: console
+
+    $ env XCLIM_TESTDATA_REPO_URL="https://github.com/my_username/xclim-testdata" pytest
+"""
+
+TESTDATA_BRANCH = str(os.getenv("XCLIM_TESTDATA_BRANCH", default_testdata_version))
+"""Sets the branch of the testing data repository to use when fetching datasets.
+
+Notes
+-----
+When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable:
+
+.. code-block:: console
+
+    $ export XCLIM_TESTDATA_BRANCH="my_testing_branch"
+
+or setting the variable at runtime:
+
+.. code-block:: console
+
+    $ env XCLIM_TESTDATA_BRANCH="my_testing_branch" pytest
+"""
+
+TESTDATA_CACHE_DIR = os.getenv("XCLIM_TESTDATA_CACHE_DIR", default_testdata_cache)
+"""Sets the directory to store the testing datasets.
+
+If not set, the default location will be used (based on ``platformdirs``, see :func:`pooch.os_cache`).
+
+Notes
+-----
+When running tests locally, this can be set for both `pytest` and `tox` by exporting the variable:
+
+.. code-block:: console
+
+    $ export XCLIM_TESTDATA_CACHE_DIR="/path/to/my/data"
+
+or setting the variable at runtime:
+
+.. code-block:: console
+
+    $ env XCLIM_TESTDATA_CACHE_DIR="/path/to/my/data" pytest
+"""
+

 def list_input_variables(
     submodules: Sequence[str] | None = None, realms: Sequence[str] | None = None
@@ -106,18 +193,7 @@ def list_input_variables(
     return variables


-def run_doctests():
-    """Run the doctests for the module."""
-    import pytest
-
-    cmd = [
-        f"--rootdir={Path(__file__).absolute().parent}",
-        "--numprocesses=0",
-        "--xdoctest",
-        f"{Path(__file__).absolute().parents[1]}",
-    ]
-
-    sys.exit(pytest.main(cmd))
+# Publishing Tools ###


 def publish_release_notes(
@@ -205,6 +281,29 @@ def publish_release_notes(
     return None


+_xclim_deps = [
+    "xclim",
+    "xarray",
+    "statsmodels",
+    "sklearn",
+    "scipy",
+    "pint",
+    "pandas",
+    "numpy",
+    "numba",
+    "lmoments3",
+    "jsonpickle",
+    "flox",
+    "dask",
+    "cf_xarray",
+    "cftime",
+    "clisops",
+    "click",
+    "bottleneck",
+    "boltons",
+]
+
+
 def show_versions(
     file: os.PathLike | StringIO | TextIO | None = None,
     deps: list[str] | None = None,
@@ -266,3 +365,338 @@ def show_versions(
     else:
         print(message, file=file)
     return None
+
+
+# Test Data Utilities ###
+
+
+def run_doctests():
+    """Run the doctests for the module."""
+    import pytest
+
+    cmd = [
+        f"--rootdir={Path(__file__).absolute().parent}",
+        "--numprocesses=0",
+        "--xdoctest",
+        f"{Path(__file__).absolute().parents[1]}",
+    ]
+
+    sys.exit(pytest.main(cmd))
+
+
+def testing_setup_warnings():
+    """Warn users about potential incompatibilities between xclim and xclim-testdata versions."""
+    if (
+        re.match(r"^\d+\.\d+\.\d+$", __xclim_version__)
+        and TESTDATA_BRANCH != default_testdata_version
+    ):
+        # This does not need to be emitted on GitHub Workflows and ReadTheDocs
+        if not os.getenv("CI") and not os.getenv("READTHEDOCS"):
+            warnings.warn(
+                f"`xclim` stable ({__xclim_version__}) is running tests against a non-default branch of the testing data. "
+                "It is possible that changes to the testing data may be incompatible with some assertions in this version. "
+                f"Please be sure to check {TESTDATA_REPO_URL} for more information.",
+            )
+
+    if re.match(r"^v\d+\.\d+\.\d+", TESTDATA_BRANCH):
+        # Find the date of last modification of xclim source files to generate a calendar version
+        install_date = dt.strptime(
+            time.ctime(os.path.getmtime(xclim.__file__)),
+            "%a %b %d %H:%M:%S %Y",
+        )
+        install_calendar_version = (
+            f"{install_date.year}.{install_date.month}.{install_date.day}"
+        )
+
+        if Version(TESTDATA_BRANCH) > Version(install_calendar_version):
+            warnings.warn(
+                f"The installation date of `xclim` ({install_date.ctime()}) "
+                f"predates the last release of testing data ({TESTDATA_BRANCH}). "
+                "It is very likely that the testing data is incompatible with this build of `xclim`.",
+            )
+
+
+def load_registry(
+    branch: str = TESTDATA_BRANCH, repo: str = TESTDATA_REPO_URL
+) -> dict[str, str]:
+    """Load the registry file for the test data.
+
+    Parameters
+    ----------
+    branch : str
+        Branch of the repository to use when fetching the registry file.
+    repo : str
+        URL of the repository to use when fetching the registry file.
+
+    Returns
+    -------
+    dict
+        Dictionary of filenames and hashes.
+    """
+    remote_registry = audit_url(f"{repo}/raw/{branch}/data/registry.txt")
+
+    if branch != default_testdata_version:
+        custom_registry_folder = Path(
+            str(ilr.files("xclim").joinpath(f"testing/{branch}"))
+        )
+        custom_registry_folder.mkdir(parents=True, exist_ok=True)
+        registry_file = custom_registry_folder.joinpath("registry.txt")
+        urlretrieve(remote_registry, registry_file)  # noqa: S310
+    elif repo != default_testdata_repo_url:
+        registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt")))
+        urlretrieve(remote_registry, registry_file)  # noqa: S310
+    else:
+        # Fall back to the registry file shipped with the package
+        registry_file = Path(str(ilr.files("xclim").joinpath("testing/registry.txt")))
+
+    if not registry_file.exists():
+        raise FileNotFoundError(f"Registry file not found: {registry_file}")
+
+    # Load the registry file
+    with registry_file.open() as f:
+        registry = {line.split()[0]: line.split()[1] for line in f}
+    return registry
+
+
+def nimbus(  # noqa: PR01
+    repo: str = TESTDATA_REPO_URL,
+    branch: str = TESTDATA_BRANCH,
+    cache_dir: str | Path = TESTDATA_CACHE_DIR,
+    data_updates: bool = True,
+):
+    """Pooch registry instance for xclim test data.
+
+    Parameters
+    ----------
+    repo : str
+        URL of the repository to use when fetching testing datasets.
+    branch : str
+        Branch of the repository to use when fetching testing datasets.
+    cache_dir : str or Path
+        The path to the directory where the data files are stored.
+    data_updates : bool
+        If True, allow updates to the data files. Default is True.
+
+    Returns
+    -------
+    pooch.Pooch
+        The Pooch instance for accessing the xclim testing data.
+
+    Notes
+    -----
+    There are three environment variables that can be used to control the behaviour of this registry:
+      - ``XCLIM_TESTDATA_CACHE_DIR``: If this environment variable is set, it will be used as the base directory to
+        store the data files. The directory should be an absolute path (i.e., it should start with ``/``).
+        Otherwise, the default location will be used (based on ``platformdirs``, see :py:func:`pooch.os_cache`).
+      - ``XCLIM_TESTDATA_REPO_URL``: If this environment variable is set, it will be used as the URL of the repository
+        to use when fetching datasets. Otherwise, the default repository will be used.
+      - ``XCLIM_TESTDATA_BRANCH``: If this environment variable is set, it will be used as the branch of the repository
+        to use when fetching datasets. Otherwise, the default branch will be used.
+
+    Examples
+    --------
+    Using the registry to download a file:
+
+    .. code-block:: python
+
+        import xarray as xr
+        from xclim.testing.utils import nimbus
+
+        example_file = nimbus().fetch("example.nc")
+        data = xr.open_dataset(example_file)
+    """
+    if pooch is None:
+        raise ImportError(
+            "The `pooch` package is required to fetch the xclim testing data. "
+            "You can install it with `pip install pooch` or `pip install xclim[dev]`."
+        )
+
+    remote = audit_url(f"{repo}/raw/{branch}/data")
+    return pooch.create(
+        path=cache_dir,
+        base_url=remote,
+        version=default_testdata_version,
+        version_dev=branch,
+        allow_updates=data_updates,
+        registry=load_registry(branch=branch, repo=repo),
+    )
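Taken together, `load_registry` answers "which files exist and what should they hash to", while `nimbus` answers "how to fetch and cache them". A minimal sketch of that interaction (assuming `pooch` is installed, network access is allowed, and the default repository, branch, and cache settings are in effect; the file name is one used elsewhere in this changeset):

.. code-block:: python

    from xclim.testing.utils import load_registry, nimbus

    # The registry is a plain mapping of relative file paths to known hashes.
    registry = load_registry()
    print(registry["ERA5/daily_surface_cancities_1990-1993.nc"])

    # `nimbus()` wraps the same registry in a `pooch.Pooch` instance: `fetch`
    # downloads the file on first use, verifies its hash, and returns the
    # cached local path on every later call.
    local_path = nimbus().fetch("ERA5/daily_surface_cancities_1990-1993.nc")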
+
+
+# idea copied from raven that it borrowed from xclim that borrowed it from xarray that was borrowed from Seaborn
+def open_dataset(
+    name: str | os.PathLike[str],
+    dap_url: str | None = None,
+    branch: str = TESTDATA_BRANCH,
+    repo: str = TESTDATA_REPO_URL,
+    cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE_DIR,
+    **kwargs,
+) -> Dataset:
+    r"""Open a dataset from the online GitHub-like repository.
+
+    If a local copy is found, it is always used to avoid unnecessary network traffic.
+
+    Parameters
+    ----------
+    name : str or os.PathLike
+        Name of the file containing the dataset.
+    dap_url : str, optional
+        URL to the OPeNDAP folder where the data is stored. If supplied, supersedes `repo` and `branch`.
+    branch : str
+        Branch of the repository to use when fetching datasets.
+    repo : str
+        URL of the repository to use when fetching testing datasets.
+    cache_dir : str or Path, optional
+        The directory in which to search for and write cached data.
+    \*\*kwargs
+        For NetCDF files, keywords passed to :py:func:`xarray.open_dataset`.
+
+    Returns
+    -------
+    Dataset
+        The opened dataset.
+
+    Raises
+    ------
+    ValueError
+        If the cache directory is not set.
+    OSError
+        If the file is not found in the cache directory or cannot be read.
+
+    See Also
+    --------
+    xarray.open_dataset
+    """
+    if cache_dir is None:
+        raise ValueError(
+            "The cache directory must be set. "
+            "Please set the `cache_dir` parameter or the `XCLIM_TESTDATA_CACHE_DIR` environment variable."
+        )
+
+    if dap_url:
+        try:
+            return _open_dataset(
+                audit_url(urljoin(dap_url, str(name)), context="OPeNDAP"), **kwargs
+            )
+        except URLError:
+            raise
+        except OSError as e:
+            msg = f"OPeNDAP file not read. Verify that the service is available: '{urljoin(dap_url, str(name))}'"
+            raise OSError(msg) from e
+
+    local_file = Path(cache_dir).joinpath(name)
+    if not local_file.exists():
+        try:
+            local_file = nimbus(branch=branch, repo=repo, cache_dir=cache_dir).fetch(
+                name
+            )
+        except OSError as e:
+            raise OSError(
+                f"File not found locally. Verify that the testing data is available on the remote: {local_file}"
+            ) from e
+    return _open_dataset(local_file, **kwargs)
+
+
+def populate_testing_data(
+    temp_folder: Path | None = None,
+    repo: str = TESTDATA_REPO_URL,
+    branch: str = TESTDATA_BRANCH,
+    local_cache: Path = TESTDATA_CACHE_DIR,
+) -> None:
+    """Populate the local cache with the testing data.
+
+    Parameters
+    ----------
+    temp_folder : Path, optional
+        Path to a temporary folder to use as the local cache. If not provided, the default location will be used.
+    repo : str, optional
+        URL of the repository to use when fetching testing datasets.
+    branch : str, optional
+        Branch of xclim-testdata to use when fetching testing datasets.
+    local_cache : Path
+        The path to the local cache. Defaults to the location set by the platformdirs library.
+        The testing data will be downloaded to this local cache.
+
+    Returns
+    -------
+    None
+    """
+    # Create the Pooch instance
+    n = nimbus(repo=repo, branch=branch, cache_dir=temp_folder or local_cache)
+
+    # Download the files
+    errored_files = []
+    for file in load_registry():
+        try:
+            n.fetch(file)
+        except HTTPError:
+            msg = f"File `{file}` not accessible in remote repository."
+            logging.error(msg)
+            errored_files.append(file)
+        except SocketBlockedError as e:  # noqa
+            msg = (
+                "Unable to access registry file online. Testing suite is being run with `--disable-socket`. "
+                "If you intend to run tests with this option enabled, please download the file beforehand with the "
+                "following console command: `$ xclim prefetch_testing_data`."
+            )
+            raise SocketBlockedError(msg) from e
+
+    if errored_files:
+        logging.error(
+            "The following files were unable to be downloaded: %s",
+            errored_files,
+        )
+    else:
+        logging.info("Files were downloaded successfully.")
+
+
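Because `populate_testing_data` walks the whole registry, it doubles as the offline-preparation step referenced in the error message above. A minimal sketch of that prefetch step (assuming the default repository, branch, and cache settings; the `$ xclim prefetch_testing_data` console command mentioned above is the CLI equivalent):

.. code-block:: python

    from xclim.testing.utils import populate_testing_data

    # Download every file listed in the registry into the local cache so that
    # later test runs (e.g., with `--disable-socket`) never touch the network.
    populate_testing_data()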
+def gather_testing_data(
+    worker_cache_dir: str | os.PathLike[str] | Path,
+    worker_id: str,
+    _cache_dir: str | os.PathLike[str] | None = TESTDATA_CACHE_DIR,
+):
+    """Gather testing data across workers.
+
+    Parameters
+    ----------
+    worker_cache_dir : str or Path
+        The directory where the testing data will be copied for the given worker.
+    worker_id : str
+        The ID of the worker. When "master", the data is written directly to the shared cache.
+    _cache_dir : str or Path, optional
+        The directory where the testing data is cached.
+    """
+    if _cache_dir is None:
+        raise ValueError(
+            "The cache directory must be set. "
+            "Please set the `_cache_dir` parameter or the `XCLIM_TESTDATA_CACHE_DIR` environment variable."
+        )
+    cache_dir = Path(_cache_dir)
+
+    if worker_id == "master":
+        populate_testing_data(branch=TESTDATA_BRANCH)
+    else:
+        if platform.system() == "Windows":
+            if not cache_dir.joinpath(default_testdata_version).exists():
+                raise FileNotFoundError(
+                    "Testing data not found and UNIX-style file-locking is not supported on Windows. "
+                    "Consider running `$ xclim prefetch_testing_data` to download testing data beforehand."
+                )
+        else:
+            cache_dir.mkdir(exist_ok=True, parents=True)
+            lockfile = cache_dir.joinpath(".lock")
+            test_data_being_written = FileLock(lockfile)
+            with test_data_being_written:
+                # This flag prevents multiple calls from re-attempting to download testing data in the same pytest run
+                populate_testing_data(branch=TESTDATA_BRANCH)
+                cache_dir.joinpath(".data_written").touch()
+            with test_data_being_written.acquire():
+                if lockfile.exists():
+                    lockfile.unlink()
+        copytree(cache_dir.joinpath(default_testdata_version), worker_cache_dir)
+
+
+# Testing Utilities ###
+
+
+def audit_url(url: str, context: str | None = None) -> str:
+    """Check if the URL is well-formed.
+
+    Raises
+    ------
+    URLError
+        If the URL is not well-formed.
+    """
+    msg = ""
+    result = urlparse(url)
+    if result.scheme == "http":
+        msg = f"{context if context else ''} URL is not using secure HTTP: '{url}'".strip()
+    if not all([result.scheme, result.netloc]):
+        msg = f"{context if context else ''} URL is not well-formed: '{url}'".strip()
+
+    if msg:
+        logger.error(msg)
+        raise URLError(msg)
+    return url

From 202c454003e196be75f865cdf2fb4142aa3b1ea4 Mon Sep 17 00:00:00 2001
From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com>
Date: Wed, 28 Aug 2024 14:11:24 -0400
Subject: [PATCH 17/21] fix testing import

---
 tests/test_testing_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_testing_utils.py b/tests/test_testing_utils.py
index 0253b7881..6fb5d2dd1 100644
--- a/tests/test_testing_utils.py
+++ b/tests/test_testing_utils.py
@@ -9,8 +9,7 @@

 from xclim import __version__ as __xclim_version__
 from xclim.testing.helpers import test_timeseries as timeseries
-from xclim.testing.utils import open_dataset as testing_open_dataset
-from xclim.testing.utils import publish_release_notes, show_versions
+from xclim.testing.utils import open_dataset, publish_release_notes, show_versions


 class TestFixtures:
@@ -44,7 +43,8 @@ def test_open_testdata(
     ):
         from xclim.testing.utils import default_testdata_cache, default_testdata_version

-        ds = testing_open_dataset(
+        # Test with the top-level default engine
+        ds = open_dataset(
             Path("cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712.nc"),
             cache_dir=default_testdata_cache.joinpath(default_testdata_version),
             engine="h5netcdf",

From f652b751ba902c81838f07fd0d7970b7206abf1d Mon Sep 17 00:00:00 2001
From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com>
Date: Wed, 28 Aug 2024 14:32:09 -0400
Subject: [PATCH 18/21] fix imports in 
notebooks, reduce possibility of importing wrong open_dataset --- docs/notebooks/analogs.ipynb | 4 ++-- docs/notebooks/cli.ipynb | 2 -- docs/notebooks/customize.ipynb | 3 +-- docs/notebooks/ensembles.ipynb | 6 ++---- docs/notebooks/sdba-advanced.ipynb | 1 - docs/notebooks/units.ipynb | 1 - docs/notebooks/usage.ipynb | 2 +- xclim/testing/helpers.py | 6 +++--- 8 files changed, 9 insertions(+), 16 deletions(-) diff --git a/docs/notebooks/analogs.ipynb b/docs/notebooks/analogs.ipynb index b8cc9612c..2c53f01a8 100644 --- a/docs/notebooks/analogs.ipynb +++ b/docs/notebooks/analogs.ipynb @@ -27,7 +27,7 @@ "from xarray.coding.calendar_ops import convert_calendar\n", "\n", "from xclim import analog\n", - "from xclim.testing.helpers import open_dataset" + "from xclim.testing import open_dataset" ] }, { @@ -105,7 +105,7 @@ "outputs": [], "source": [ "fig, axs = plt.subplots(nrows=3, figsize=(6, 6), sharex=True)\n", - "sim_std = convert_calendar(sim, \"default\")\n", + "sim_std = convert_calendar(sim, \"standard\")\n", "obs_chibou = obs.sel(lat=sim.lat, lon=sim.lon, method=\"nearest\")\n", "\n", "for ax, var in zip(axs, obs_chibou.data_vars.keys()):\n", diff --git a/docs/notebooks/cli.ipynb b/docs/notebooks/cli.ipynb index aac26b04e..560ed6771 100644 --- a/docs/notebooks/cli.ipynb +++ b/docs/notebooks/cli.ipynb @@ -90,8 +90,6 @@ "metadata": {}, "outputs": [], "source": [ - "from __future__ import annotations\n", - "\n", "import warnings\n", "\n", "import numpy as np\n", diff --git a/docs/notebooks/customize.ipynb b/docs/notebooks/customize.ipynb index a5f8a9b34..1b1ed7a51 100644 --- a/docs/notebooks/customize.ipynb +++ b/docs/notebooks/customize.ipynb @@ -19,8 +19,7 @@ "\n", "import xarray as xr\n", "\n", - "import xclim\n", - "from xclim.testing import open_dataset" + "import xclim" ] }, { diff --git a/docs/notebooks/ensembles.ipynb b/docs/notebooks/ensembles.ipynb index 10cc60a49..381b3af14 100644 --- a/docs/notebooks/ensembles.ipynb +++ b/docs/notebooks/ensembles.ipynb @@ -155,8 +155,6 @@ }, "outputs": [], "source": [ - "from pathlib import Path\n", - "\n", "import xarray as xr\n", "\n", "# Set display to HTML style (for fancy output)\n", @@ -165,10 +163,10 @@ "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "\n", - "%matplotlib inline\n", - "\n", "from xclim import ensembles\n", "\n", + "%matplotlib inline\n", + "\n", "ens = ensembles.create_ensemble(data_folder.glob(\"ens_tas_m*.nc\")).load()\n", "ens.close()" ] diff --git a/docs/notebooks/sdba-advanced.ipynb b/docs/notebooks/sdba-advanced.ipynb index b7d2c4210..71eb91e54 100644 --- a/docs/notebooks/sdba-advanced.ipynb +++ b/docs/notebooks/sdba-advanced.ipynb @@ -54,7 +54,6 @@ "from __future__ import annotations\n", "\n", "import matplotlib.pyplot as plt\n", - "import nc_time_axis\n", "import numpy as np\n", "import xarray as xr\n", "\n", diff --git a/docs/notebooks/units.ipynb b/docs/notebooks/units.ipynb index beaaa6a4b..014180b8a 100644 --- a/docs/notebooks/units.ipynb +++ b/docs/notebooks/units.ipynb @@ -18,7 +18,6 @@ "import xarray as xr\n", "\n", "import xclim\n", - "from xclim import indices\n", "from xclim.core import units\n", "from xclim.testing import open_dataset\n", "\n", diff --git a/docs/notebooks/usage.ipynb b/docs/notebooks/usage.ipynb index 15a535afb..527b016a6 100644 --- a/docs/notebooks/usage.ipynb +++ b/docs/notebooks/usage.ipynb @@ -26,7 +26,7 @@ "import xarray as xr\n", "\n", "import xclim.indices\n", - "from xclim.testing.helpers import open_dataset" + "from xclim.testing import open_dataset" ] }, { diff 
--git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index e29af99ff..5dd312e4e 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -14,13 +14,13 @@ from dask.callbacks import Callback import xclim +import xclim.testing.utils as xtu from xclim.core import calendar from xclim.core.utils import VARIABLES from xclim.indices import ( longwave_upwelling_radiation_from_net_downwelling, shortwave_upwelling_radiation_from_net_downwelling, ) -from xclim.testing.utils import open_dataset logger = logging.getLogger("xclim") @@ -40,7 +40,7 @@ def generate_atmos( cache_dir: str | os.PathLike[str] | Path, ) -> dict[str, xr.DataArray]: """Create the `atmosds` synthetic testing dataset.""" - with open_dataset( + with xtu.open_dataset( "ERA5/daily_surface_cancities_1990-1993.nc", branch=branch, cache_dir=cache_dir, @@ -67,7 +67,7 @@ def generate_atmos( ds.to_netcdf(atmos_file, engine="h5netcdf") # Give access to dataset variables by name in namespace - with open_dataset( + with xtu.open_dataset( atmos_file, branch=branch, cache_dir=cache_dir, engine="h5netcdf" ) as ds: namespace = {f"{var}_dataset": ds[var] for var in ds.data_vars} From aa0e01f3f6d3052c106934f8183c93cf5de37a60 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:12:38 -0400 Subject: [PATCH 19/21] fix imports in notebooks --- docs/notebooks/extendxclim.ipynb | 2 +- docs/notebooks/sdba-advanced.ipynb | 2 +- docs/notebooks/sdba.ipynb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/notebooks/extendxclim.ipynb b/docs/notebooks/extendxclim.ipynb index a07a130ae..d6ef80233 100644 --- a/docs/notebooks/extendxclim.ipynb +++ b/docs/notebooks/extendxclim.ipynb @@ -466,7 +466,7 @@ "metadata": {}, "outputs": [], "source": [ - "from xclim.testing.helpers import open_dataset\n", + "from xclim.testing import open_dataset\n", "\n", "ds = open_dataset(\"ERA5/daily_surface_cancities_1990-1993.nc\")\n", "with xr.set_options(keep_attrs=True):\n", diff --git a/docs/notebooks/sdba-advanced.ipynb b/docs/notebooks/sdba-advanced.ipynb index 71eb91e54..642811e90 100644 --- a/docs/notebooks/sdba-advanced.ipynb +++ b/docs/notebooks/sdba-advanced.ipynb @@ -432,7 +432,7 @@ "\n", "import xclim.sdba as sdba\n", "from xclim.core.units import convert_units_to\n", - "from xclim.testing.helpers import open_dataset\n", + "from xclim.testing import open_dataset\n", "\n", "group = sdba.Grouper(\"time.dayofyear\", window=31)\n", "\n", diff --git a/docs/notebooks/sdba.ipynb b/docs/notebooks/sdba.ipynb index 45f4a53a3..72c7df816 100644 --- a/docs/notebooks/sdba.ipynb +++ b/docs/notebooks/sdba.ipynb @@ -457,7 +457,7 @@ "import numpy as np\n", "\n", "from xclim.core.units import convert_units_to\n", - "from xclim.testing.helpers import open_dataset\n", + "from xclim.testing import open_dataset\n", "\n", "dref = open_dataset(\"sdba/ahccd_1950-2013.nc\", drop_variables=[\"lat\", \"lon\"]).sel(\n", " time=slice(\"1981\", \"2010\")\n", From be3645a62e7c105bfe46cc07f49a31924062e0a0 Mon Sep 17 00:00:00 2001 From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:33:06 -0400 Subject: [PATCH 20/21] fix final import --- docs/notebooks/sdba-advanced.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/notebooks/sdba-advanced.ipynb b/docs/notebooks/sdba-advanced.ipynb index 642811e90..2c214c9a3 100644 --- a/docs/notebooks/sdba-advanced.ipynb +++ b/docs/notebooks/sdba-advanced.ipynb @@ -751,9 +751,8 @@ "source": [ "from 
matplotlib import pyplot as plt\n",
     "\n",
-    "import xclim as xc\n",
     "from xclim import sdba\n",
-    "from xclim.testing.helpers import open_dataset\n",
+    "from xclim.testing import open_dataset\n",
     "\n",
     "# load test data\n",
     "hist = open_dataset(\"sdba/CanESM2_1950-2100.nc\").sel(time=slice(\"1950\", \"1980\")).tasmax\n",

From 5b589f5d9d122c082764c8587488f1eaf2649b1e Mon Sep 17 00:00:00 2001
From: Zeitsperre <10819524+Zeitsperre@users.noreply.github.com>
Date: Wed, 28 Aug 2024 16:02:58 -0400
Subject: [PATCH 21/21] update CHANGELOG.rst

---
 CHANGELOG.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index de439ac1b..deb1b6128 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -12,16 +12,16 @@ Bug fixes

 Breaking changes
 ^^^^^^^^^^^^^^^^
-* The ``xclim.testing`` module has been refactored to make use of `pooch` and many now-redundant functions have been removed: (:pull:`1889`)
-    * ``xclim.testing.utilities.open_dataset`` is now found under ``xclim.testing.helpers.open_dataset`` and uses a `pooch` instance to deliver locally-stored datasets. Its call signature has also changed.
-    * ``xclim.testing.utilities.get_file``, ``xclim.testing.utilities.get_local_testdata``, ``xclim.testing.utilities.list_datasets``, and ``xclim.testing.utilities.file_md5_checksum`` have been removed.
-    * ``xclim.testing.helpers.nimbus`` replaces much of this functionality. See the `xclim` documentation for more information.
-* The `Ouranosinc/xclim-testdata` repository has been restructured for better organization and to make better use of `pooch` and data registries for testing data fetching. (:pull:`1889`).
 * `platformdirs` is no longer a direct dependency of `xclim`, but `pooch` is required to use many of the new testing functions (installable via `pip install pooch` or `pip install 'xclim[dev]'`). (:pull:`1889`).

 Internal changes
 ^^^^^^^^^^^^^^^^
+* The `Ouranosinc/xclim-testdata` repository has been restructured for better organization and to make better use of `pooch` and data registries for testing data fetching (see: `xclim-testdata PR/29 <https://github.com/Ouranosinc/xclim-testdata/pull/29>`_). (:pull:`1889`).
+* The ``xclim.testing`` module has been refactored to make use of `pooch` with file registries. Several testing functions have been removed as a result: (:pull:`1889`)
+    * ``xclim.testing.utils.open_dataset`` now uses a `pooch` instance to deliver locally-stored datasets. Its call signature has also changed.
+    * ``xclim`` now accepts more environment variables to control the behaviour of the testing setup functions. These include ``XCLIM_TESTDATA_BRANCH``, ``XCLIM_TESTDATA_REPO_URL``, and ``XCLIM_TESTDATA_CACHE_DIR``.
+    * ``xclim.testing.utils.get_file``, ``xclim.testing.utils.get_local_testdata``, ``xclim.testing.utils.list_datasets``, and ``xclim.testing.utils.file_md5_checksum`` have been removed.
+    * ``xclim.testing.utils.nimbus`` replaces much of this functionality. See the `xclim` documentation for more information.
 * Many tests focused on evaluating the normal operation of remote file access tools under ``xclim.testing`` have been removed. (:pull:`1889`).
 * Setup and teardown functions that were found under ``tests/conftest.py`` have been optimized to reduce redundant calls when running ``pytest xclim``. Some obsolete `pytest` fixtures have also been removed. (:pull:`1889`).
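For reference, a minimal sketch of the workflow these changelog entries describe (assuming `pooch` is installed and the default repository, branch, and cache settings are used; the dataset path is one exercised by the test suite above):

.. code-block:: python

    from xclim.testing.utils import nimbus, open_dataset

    # Fetch a single file through the pooch registry; the download is skipped
    # on later calls when the cached copy matches the registered hash.
    path = nimbus().fetch("cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712.nc")

    # Or open it directly as an xarray.Dataset; extra keywords are passed on
    # to `xarray.open_dataset`.
    ds = open_dataset(
        "cmip5/tas_Amon_CanESM2_rcp85_r1i1p1_200701-200712.nc",
        engine="h5netcdf",
    )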