Merge branch 'master' into prepare_v0390

Ouranosinc · Nov 1, 2022 · 9830e13 · 9830e13
2 parents 12ebbfb + 932fc7f
commit 9830e13
Show file tree

Hide file tree

Showing 10 changed files with 312 additions and 116 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -13,6 +13,9 @@ New features and enhancements
 * Log-logistic distribution added to `stats.py` for use with ``standardized_precipitation_index`` and ``standardized_precipitation_evapotranspiration_index``. (:issue:`1141`, :pull:`1183`).
 * New option in many indices allowing for resampling in given periods after ``run_length`` operations. (:issue:`505`, :issue:`916`, :issue:`917`, :pull:`1161`).
 * New base indicator class for sdba: ``StatisticalPropertyMeasure``, those measures that also reduce the time (as a property does). (:pull:`1198`).
+* ``xclim.core.calendar.common_calendar`` to find the best calendar to use when uniformizing an heterogeneous collection of data. (:pull:`1217`).
+* ``xclim.ensembles.create_ensemble`` now accepts ``calendar=None``, and uses the above function to guess the best one. It also now accepts ``cal_kwargs`` to fine tune the calendar conversion. (:issue:`1190`, :pull:`1217`).
+* New data check : ``xclim.core.datachecks.check_common_time`` that ensures all inputs of multivariate indicators have the same frequency (and the same time anchoring for daily and hourly data). (:issue:`1111`, :pull:`1217`).
 
 New indicators
 ^^^^^^^^^^^^^^
@@ -41,6 +44,7 @@ Breaking changes
 * The ``freshet_start`` indice is now deprecated in favour of ``first_day_temperature_above`` with `thresh='0 degC', window=5`. The `freshet_start` indicator is now based on ``first_day_temperature_above``, but is otherwise unaffected. (:issue:`1195`, :pull:`1196`).
 * Call signatures for several indices/indicators have been modified to optionally accept `op` for manually setting threshold comparison operators (:issue:`1194`, :pull:`1197`). The affected indices and indicators as follows:
    - ``hot_spell_max_length``, ``hot_spell_frequency``, ``cold_spell_days``, ``cold_spell_frequency``, ``heat_wave_index``, ``warm_day_frequency`` (indice only), ``warm_night_frequency`` (indice only), ``dry_days``, ``wetdays``, ``wetdays_prop``.
+* Cleaner ``xclim.core.calendar.parse_offset`` : fails on invalid frequencies, return implicit anchors (YS -> JAN, Y -> DEC) and implicit ``is_start_anchored`` (D -> True). (:issue:`1213`, , :pull:`1217`).
 
 Bug fixes
 ^^^^^^^^^

diff --git a/setup.py b/setup.py
diff --git a/xclim/core/calendar.py b/xclim/core/calendar.py
@@ -8,8 +8,7 @@
 from __future__ import annotations
 
 import datetime as pydt
-import re
-from typing import Any, NewType, Sequence
+from typing import Any, Sequence
 
 import cftime
 import numpy as np
@@ -146,6 +145,55 @@ def get_calendar(obj: Any, dim: str = "time") -> str:
     raise ValueError(f"Calendar could not be inferred from object of type {type(obj)}.")
 
 
+def common_calendar(calendars: Sequence[str], join="outer") -> str:
+    """Return a calendar common to all calendars from a list.
+
+    Uses the hierarchy: 360_day < noleap < standard < all_leap.
+    Returns "default" only if all calendars are "default."
+
+    Parameters
+    ----------
+    calendars: Sequence of string
+      List of calendar names.
+    join : {'inner', 'outer'}
+      The criterion for the common calendar.
+
+      - 'outer': the common calendar is the smallest calendar (in number of days by year)
+                 that will include all the dates of the other calendars. When converting
+                 the data to this calendar, no timeseries will lose elements, but some
+                 might be missing (gaps or NaNs in the series).
+      - 'inner': the common calender is the smallest calendar of the list. When converting
+                 the data to this calendar, no timeseries will have missing elements (no gaps or NaNs),
+                 but some might be dropped.
+
+    Examples
+    --------
+    >>> common_calendar(["360_day", "noleap", "default"], join="outer")
+    'standard'
+    >>> common_calendar(["360_day", "noleap", "default"], join="inner")
+    '360_day'
+    """
+    if all(cal == "default" for cal in calendars):
+        return "default"
+
+    trans = {
+        "proleptic_gregorian": "standard",
+        "gregorian": "standard",
+        "default": "standard",
+        "366_day": "all_leap",
+        "365_day": "noleap",
+        "julian": "standard",
+    }
+    ranks = {"360_day": 0, "noleap": 1, "standard": 2, "all_leap": 3}
+    calendars = sorted([trans.get(cal, cal) for cal in calendars], key=ranks.get)
+
+    if join == "outer":
+        return calendars[-1]
+    if join == "inner":
+        return calendars[0]
+    raise NotImplementedError(f"Unknown join criterion `{join}`.")
+
+
 def convert_calendar(
     source: xr.DataArray | xr.Dataset,
     target: xr.DataArray | str,
@@ -613,19 +661,28 @@ def parse_offset(freq: str) -> Sequence[str]:
 
     Returns
     -------
-    multiplier (int), offset base (str), is start anchored (bool), anchor (str or None)
-      "[n]W" is always replaced with "[7n]D", as xarray doesn't support "W" for cftime indexes.
-      "Y" is always replaced with "A".
+    multiplier : int
+      Multiplier of the base frequency. "[n]W" is always replaced with "[7n]D", as xarray doesn't support "W" for cftime indexes.
+    offset_base : str
+      Base frequency. "Y" is always replaced with "A".
+    is_start_anchored : bool
+      Whether coordinates of this frequency should correspond to the beginning of the period (`True`) or its end (`False`).
+      Can only be False when base is A, Q or M.
+    anchor : str or None
+      Anchor date for bases A or Q. As xarray doesn't support "W", neither does xclim (anchor information is lost when given).
     """
-    patt = r"(\d*)(\w)(S)?(?:-(\w{2,3}))?"
-    mult, base, start, anchor = re.search(patt, freq).groups()
-    mult = int(mult or "1")
-    base = base.replace("Y", "A")
+    # Useful to raise on invalid freqs, convert Y to A and get default anchor (A, Q)
+    offset = pd.tseries.frequencies.to_offset(freq)
+    base, *anchor = offset.name.split("-")
+    anchor = anchor[0] if len(anchor) > 0 else None
+    start = ("S" in base) or (base[0] not in "AQM")
+    base = base[0]
+    mult = offset.n
     if base == "W":
         mult = 7 * mult
         base = "D"
-        anchor = ""
-    return mult, base, start == "S", anchor
+        anchor = None
+    return mult, base, start, anchor
 
 
 def construct_offset(mult: int, base: str, start_anchored: bool, anchor: str | None):
@@ -638,9 +695,9 @@ def construct_offset(mult: int, base: str, start_anchored: bool, anchor: str | N
     base : str
       The base period string (one char).
     start_anchored: bool
-      If True, adds the "S" flag.
+      If True and base in [Y, A, Q, M], adds the "S" flag.
     anchor: str, optional
-      The month anchor of the offset.
+      The month anchor of the offset. Defaults to JAN for bases AS, Y and QS and to DEC for bases A and Q.
 
     Returns
     -------
@@ -651,7 +708,12 @@ def construct_offset(mult: int, base: str, start_anchored: bool, anchor: str | N
     -----
     This provides the mirror opposite functionality of :py:func:`parse_offset`.
     """
-    return f"{mult if mult > 1 else ''}{base}{'S' if start_anchored else ''}{'-' if anchor else ''}{anchor or ''}"
+    start = "S" if start_anchored and base in "YAQM" else ""
+    if anchor is None and base in "AQY":
+        anchor = "JAN" if start_anchored else "DEC"
+    return (
+        f"{mult if mult > 1 else ''}{base}{start}{'-' if anchor else ''}{anchor or ''}"
+    )
 
 
 def _interpolate_doy_calendar(

diff --git a/xclim/core/datachecks.py b/xclim/core/datachecks.py
@@ -31,6 +31,12 @@ def check_freq(var: xr.DataArray, freq: str | Sequence[str], strict: bool = True
     strict : bool
         Whether multiples of the frequencies are considered invalid or not. With `strict` set to False, a '3H' series
         will not raise an error if freq is set to 'H'.
+
+    Raises
+    ------
+    ValidationError
+        - If the frequency of `var` is not inferrable.
+        - If the frequency of `var` does not match the requested `freq`.
     """
     if isinstance(freq, str):
         freq = [freq]
@@ -59,3 +65,46 @@ def check_daily(var: xr.DataArray):
     This does not check for gaps in series.
     """
     return check_freq(var, "D")
+
+
+@datacheck
+def check_common_time(inputs: Sequence[xr.DataArray]):
+    """Raise an error if the list of inputs doesn't have a single common frequency.
+
+    Raises
+    ------
+    ValidationError
+        - if the frequency of any input can't be inferred
+        - if inputs have different frequencies
+        - if inputs have a daily or hourly frequency, but they are not given at the same time of day.
+
+    Parameters
+    ----------
+    inputs : Sequence of xr.DataArray
+        Input arrays.
+    """
+    # Check all have the same freq
+    freqs = [xr.infer_freq(da.time) for da in inputs]
+    if None in freqs:
+        raise ValidationError(
+            "Unable to infer the frequency of the time series. "
+            "To mute this, set xclim's option data_validation='log'."
+        )
+    if len(set(freqs)) != 1:
+        raise ValidationError(
+            f"Inputs have different frequencies. Got : {freqs}."
+            "To mute this, set xclim's option data_validation='log'."
+        )
+
+    # Check if anchor is the same
+    freq = freqs[0]
+    base = parse_offset(freq)[1]
+    fmt = {"H": ":%M", "D": "%H:%M"}
+    if base in fmt:
+        outs = {da.indexes["time"][0].strftime(fmt[base]) for da in inputs}
+        if len(outs) > 1:
+            raise ValidationError(
+                f"All inputs have the same frequency ({freq}), but they are not anchored on the same minutes (got {outs}). "
+                f"xarray's alignment would silently fail. You can try to fix this with `da.resample('{freq}').mean()`."
+                "To mute this, set xclim's option data_validation='log'."
+            )
diff --git a/xclim/core/indicator.py b/xclim/core/indicator.py
@@ -1273,12 +1273,21 @@ def datacheck(self, **das):
         * assert no temperature has the same value 5 days in a row
 
         This base datacheck checks that the input data has a valid sampling frequency, as given in self.src_freq.
+        If there are multiple inputs, it also checks if they all have the same frequency and the same anchor.
         """
         if self.src_freq is not None:
             for key, da in das.items():
                 if "time" in da.coords and da.time.ndim == 1 and len(da.time) > 3:
                     datachecks.check_freq(da, self.src_freq, strict=True)
 
+            datachecks.check_common_time(
+                [
+                    da
+                    for da in das.values()
+                    if "time" in da.coords and da.time.ndim == 1 and len(da.time) > 3
+                ]
+            )
+
     def __getattr__(self, attr):
         """Return the attribute."""
         if attr in self._cf_names:

diff --git a/xclim/core/units.py b/xclim/core/units.py
@@ -23,7 +23,6 @@
 from .utils import ValidationError
 
 __all__ = [
-    "ValidationError",
     "amount2rate",
     "check_units",
     "convert_units_to",

diff --git a/xclim/ensembles/_base.py b/xclim/ensembles/_base.py
@@ -12,7 +12,7 @@
 import numpy as np
 import xarray as xr
 
-from xclim.core.calendar import convert_calendar, get_calendar
+from xclim.core.calendar import common_calendar, convert_calendar, get_calendar
 from xclim.core.formatting import update_history
 from xclim.core.utils import calc_perc
 
@@ -21,8 +21,9 @@ def create_ensemble(
     datasets: Any,
     mf_flag: bool = False,
     resample_freq: str | None = None,
-    calendar: str = "default",
+    calendar: str | None = None,
     realizations: Sequence[Any] | None = None,
+    cal_kwargs: dict | None = None,
     **xr_kwargs,
 ) -> xr.Dataset:
     """Create an xarray dataset of an ensemble of climate simulation from a list of netcdf files.
@@ -37,27 +38,29 @@ def create_ensemble(
     Parameters
     ----------
     datasets : list or dict or string
-        List of netcdf file paths or xarray Dataset/DataArray objects . If mf_flag is True, ncfiles should be a list of
-        lists where each sublist contains input .nc files of an xarray multifile Dataset.
-        If DataArray objects are passed, they should have a name in order to be transformed into Datasets.
-        A dictionary can be passed instead of a list, in which case the keys are used as coordinates along the new
-        `realization` axis.
-        If a string is passed, it is assumed to be a glob pattern for finding datasets.
+      List of netcdf file paths or xarray Dataset/DataArray objects . If mf_flag is True, ncfiles should be a list of
+      lists where each sublist contains input .nc files of an xarray multifile Dataset.
+      If DataArray objects are passed, they should have a name in order to be transformed into Datasets.
+      A dictionary can be passed instead of a list, in which case the keys are used as coordinates along the new
+      `realization` axis.
+      If a string is passed, it is assumed to be a glob pattern for finding datasets.
     mf_flag : bool
-        If True, climate simulations are treated as xarray multifile Datasets before concatenation.
-        Only applicable when "datasets" is sequence of list of file paths.
+      If True, climate simulations are treated as xarray multifile Datasets before concatenation.
+      Only applicable when "datasets" is sequence of list of file paths.
     resample_freq : Optional[str]
-        If the members of the ensemble have the same frequency but not the same offset, they cannot be properly aligned.
-        If resample_freq is set, the time coordinate of each member will be modified to fit this frequency.
-    calendar : str
-        The calendar of the time coordinate of the ensemble.
-        For conversions involving '360_day', the align_on='date' option is used.
-        See :py:func:`xclim.core.calendar.convert_calendar`.
-        'default' is the standard calendar using np.datetime64 objects.
+      If the members of the ensemble have the same frequency but not the same offset, they cannot be properly aligned.
+      If resample_freq is set, the time coordinate of each member will be modified to fit this frequency.
+    calendar : str, optional
+      The calendar of the time coordinate of the ensemble.
+      By default, the smallest common calendar is chosen. For example, a mixed input of "noleap" and "360_day" will default to "noleap".
+      'default' is the standard calendar using np.datetime64 objects (xarray's "standard" with `use_cftime=False`).
     realizations: sequence, optional
       The coordinate values for the new `realization` axis.
       If None (default), the new axis has a simple integer coordinate.
       This argument shouldn't be used if `datasets` is a glob pattern as the dataset order is random.
+    cal_kwargs : dict, optional
+      Additionnal arguments to pass to py:func:`xclim.core.calendar.convert_calendar`.
+      For conversions involving '360_day', the align_on='date' option is used by default.
     **xr_kwargs
       Any keyword arguments to be given to `xr.open_dataset` when opening the files
       (or to `xr.open_mfdataset` if mf_flag is True)
@@ -101,7 +104,12 @@ def create_ensemble(
         )
 
     ds = _ens_align_datasets(
-        datasets, mf_flag, resample_freq, calendar=calendar, **xr_kwargs
+        datasets,
+        mf_flag,
+        resample_freq,
+        calendar=calendar,
+        cal_kwargs=cal_kwargs or {},
+        **xr_kwargs,
     )
 
     if realizations is None:
@@ -329,6 +337,7 @@ def _ens_align_datasets(
     mf_flag: bool = False,
     resample_freq: str | None = None,
     calendar: str = "default",
+    cal_kwargs: dict | None = None,
     **xr_kwargs,
 ) -> list[xr.Dataset]:
     """Create a list of aligned xarray Datasets for ensemble Dataset creation.
@@ -365,6 +374,7 @@ def _ens_align_datasets(
         datasets = glob(datasets)
 
     ds_all = []
+    calendars = []
     for i, n in enumerate(datasets):
         if mf_flag:
             ds = xr.open_mfdataset(n, combine="by_coords", **xr_kwargs)
@@ -390,14 +400,15 @@ def _ens_align_datasets(
                 time = counts.time
 
             ds["time"] = time
-
-            cal = get_calendar(time)
-            ds = convert_calendar(
-                ds,
-                calendar,
-                align_on="date" if "360_day" in [cal, calendar] else None,
-            )
+            calendars.append(get_calendar(time))
 
         ds_all.append(ds)
 
-    return ds_all
+    if not calendars:
+        # no time
+        return ds_all
+
+    if calendar is None:
+        calendar = common_calendar(calendars, join="outer")
+    cal_kwargs.setdefault("align_on", "date")
+    return [convert_calendar(ds, calendar, **cal_kwargs) for ds in ds_all]