Fix 248 (#249)
* allow creation via a list of in-memory xr.Datasets

* add test for creation via xr.Datasets, add test for no time dim

* update history.rst

* split subroutine in create_ensemble into _ens_checktimes and _ens_align_datasets
tlogan2000 authored Jul 3, 2019
1 parent 0ffe8f2 commit 5657b4f
Showing 3 changed files with 197 additions and 53 deletions.
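The headline change: create_ensemble() now accepts a list of in-memory xr.Dataset objects as well as file paths. A minimal sketch of the new usage, with hypothetical file names:

    import xarray as xr
    from xclim import ensembles

    # open the simulations ourselves, then pass the datasets directly
    ds_all = [xr.open_dataset(p) for p in ["sim1.nc", "sim2.nc", "sim3.nc"]]
    ens = ensembles.create_ensemble(ds_all)
    # input data is concatenated along a new 'realization' dimension
    assert ens.realization.size == len(ds_all)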
2 changes: 2 additions & 0 deletions HISTORY.rst
@@ -12,6 +12,8 @@ History
* Pre-commit is now used to launch code formatting inspections for local development.
* Documentation now includes more detailed usage and an example workflow notebook.
* Development build configurations are now available via both Anaconda and pip install methods.
* Modified create_ensemble() to allow creation of an ensemble dataset without a time dimension, as well as from a list of xr.Dataset objects
* Modified create_ensemble() to pad input data with NaNs when time dimensions are unequal

0.10-beta (2019-06-06)
----------------------
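The padding behaviour noted above is easiest to see with members of unequal length. A sketch under assumed file names (the variable tg_mean and the year ranges come from the tests below):

    import numpy as np
    from xclim import ensembles

    # the first member covers 1950-2100, the second only 1970-2050
    ens = ensembles.create_ensemble(["tg_1950-2100.nc", "tg_1970-2050.nc"])
    # the ensemble spans the union of time steps...
    assert ens.time.dt.year.min() == 1950
    assert ens.time.dt.year.max() == 2100
    # ...and the shorter member is NaN-padded outside its own span
    assert np.isnan(ens.tg_mean.isel(realization=1).sel(time=ens.time.dt.year < 1970)).all()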
64 changes: 61 additions & 3 deletions tests/test_utils.py
@@ -55,15 +55,74 @@ class TestEnsembleStats:
    )
    nc_files = glob.glob(os.path.join(TESTS_DATA, "EnsembleStats", "*.nc"))

    def test_checktimes(self):
        time_flag, time_all = ensembles._ens_checktimes(self.nc_files)
        assert time_flag
        assert pd.DatetimeIndex(time_all).min() == pd.Timestamp("1950-01-01 00:00:00")
        assert pd.DatetimeIndex(time_all).max() == pd.Timestamp("2100-01-01 00:00:00")

        # verify short time-series file
        time_flag, time_all1 = ensembles._ens_checktimes(
            [i for i in self.nc_files if "1970-2050" in i]
        )
        assert time_flag
        assert pd.DatetimeIndex(time_all1).min() > pd.DatetimeIndex(time_all).min()
        assert pd.DatetimeIndex(time_all1).max() < pd.DatetimeIndex(time_all).max()

        # no time
        ds = xr.open_dataset(self.nc_files[0])
        ds = ds.groupby(ds.time.dt.month).mean("time", keep_attrs=True)
        time_flag, time_all = ensembles._ens_checktimes([ds])
        assert not time_flag
        assert time_all is None

    def test_create_ensemble(self):
        ens = ensembles.create_ensemble(self.nc_files_simple)
        assert len(ens.realization) == len(self.nc_files_simple)

        # create again using xr.Dataset objects
        ds_all = []
        for n in self.nc_files_simple:
            ds = xr.open_dataset(n, decode_times=False)
            ds["time"] = xr.decode_cf(ds).time
            ds_all.append(ds)

        ens1 = ensembles.create_ensemble(ds_all)
        coords = list(ens1.coords)
        coords.extend(list(ens1.data_vars))
        for c in coords:
            np.testing.assert_array_equal(ens[c], ens1[c])

    def test_no_time(self):
        # build ensemble members as xr.Dataset objects without a time dimension
        ds_all = []
        for n in self.nc_files_simple:
            ds = xr.open_dataset(n, decode_times=False)
            ds["time"] = xr.decode_cf(ds).time
            ds_all.append(ds.groupby(ds.time.dt.month).mean("time", keep_attrs=True))

        ens = ensembles.create_ensemble(ds_all)
        assert len(ens.realization) == len(self.nc_files_simple)

    def test_create_unequal_times(self):
        ens = ensembles.create_ensemble(self.nc_files)
        assert len(ens.realization) == len(self.nc_files)
-        assert ens.time.dt.year.min() == 1970
-        assert ens.time.dt.year.max() == 2050
+        assert ens.time.dt.year.min() == 1950
+        assert ens.time.dt.year.max() == 2100

        ii = [i for i, s in enumerate(self.nc_files) if "1970-2050" in s]
        # assert that the shorter members are padded with NaNs
        assert np.all(
            np.isnan(ens.tg_mean.isel(realization=ii).sel(time=ens.time.dt.year < 1970))
        )
        assert np.all(
            np.isnan(ens.tg_mean.isel(realization=ii).sel(time=ens.time.dt.year > 2050))
        )

        ens_mean = ens.tg_mean.mean(dim=["realization", "lon", "lat"], skipna=False)
        assert ens_mean.where(~np.isnan(ens_mean), drop=True).time.dt.year.min() == 1970
        assert ens_mean.where(~np.isnan(ens_mean), drop=True).time.dt.year.max() == 2050
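Note the role of skipna=False in the last two assertions: with NaNs propagated, the ensemble mean is defined only where every member has data, so dropping NaN time steps recovers exactly the 1970-2050 period common to all members.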

    def test_calc_perc(self):
        ens = ensembles.create_ensemble(self.nc_files_simple)
@@ -323,7 +382,6 @@ def test_delayed(self):
        assert isinstance(txc.data, dask.array.core.Array)

    def test_identifier(self):
        with pytest.warns(UserWarning):
            UniIndPr(identifier="t_{}")

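The no-time-dimension path accepts ensembles of climatologies, as exercised by test_no_time above. A sketch of that pattern with hypothetical inputs:

    import xarray as xr
    from xclim import ensembles

    ds_all = []
    for p in ["sim1.nc", "sim2.nc"]:
        ds = xr.open_dataset(p)
        # reduce to a monthly climatology: 'time' is replaced by a 'month' dimension
        ds_all.append(ds.groupby(ds.time.dt.month).mean("time", keep_attrs=True))

    # create_ensemble detects the missing time dimension and skips time alignment
    ens = ensembles.create_ensemble(ds_all)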
184 changes: 134 additions & 50 deletions xclim/ensembles.py
@@ -3,21 +3,23 @@
import xarray as xr


-def create_ensemble(ncfiles, mf_flag=False):
+def create_ensemble(datasets, mf_flag=False):
"""Create an xarray datset of ensemble of climate simulation from a list of netcdf files. Input data is
concatenated along a newly created data dimension ('realization')
Returns a xarray dataset object containing input data from the list of netcdf files concatenated along
a new dimension (name:'realization'). In the case where input files have unequal time dimensions output
ensemble dataset is created for overlapping time-steps common to all input files
ensemble dataset is created for maximum time-step interval of all input files. Before concatenation datasets not
covering the entire time span have their data padded with `nan` values
    Parameters
    ----------
-    ncfiles : sequence
-      List of netcdf file paths. If mf_flag is true, ncfiles should be a list of lists where
+    datasets : sequence
+      List of netcdf file paths or xr.Dataset objects. If mf_flag is True, `datasets` should be a list of lists where
      each sublist contains input .nc files of a multifile dataset
-    mf_flag : Boolean. If true, climate simulations are treated as multifile datasets before concatenation
+    mf_flag : Boolean. If True, climate simulations are treated as multifile datasets before concatenation.
+      Only applicable when `datasets` is a sequence of file paths
    Returns
    -------
@@ -33,62 +35,25 @@ def create_ensemble(ncfiles, mf_flag=False):
    >>> from xclim import ensembles
    >>> import glob
-    >>> ncfiles = glob.glob('/*.nc')
-    >>> ens = ensembles.create_ensemble(ncfiles)
+    >>> datasets = glob.glob('/*.nc')
+    >>> ens = ensembles.create_ensemble(datasets)
    >>> print(ens)
    Using multifile datasets:
    simulation 1 is a list of .nc files (e.g. separated by time)
-    >>> ncfiles = glob.glob('dir/*.nc')
+    >>> datasets = [glob.glob('dir/*.nc')]
    simulation 2 is also a list of .nc files
-    >>> ens = utils.create_ensemble(ncfiles)
+    >>> datasets.append(glob.glob('dir2/*.nc'))
+    >>> ens = ensembles.create_ensemble(datasets, mf_flag=True)
    """

    dim = "realization"
-    ds1 = []
-    start_end_flag = True
-    # print('finding common time-steps')
-    for n in ncfiles:
-        if mf_flag:
-            ds = xr.open_mfdataset(
-                n, concat_dim="time", decode_times=False, chunks={"time": 10}
-            )
-            ds["time"] = xr.open_mfdataset(n).time
-        else:
-            ds = xr.open_dataset(n, decode_times=False)
-            ds["time"] = xr.decode_cf(ds).time
-        # get times - use common
-        time1 = pd.to_datetime(
-            {"year": ds.time.dt.year, "month": ds.time.dt.month, "day": ds.time.dt.day}
-        )
-        if start_end_flag:
-            start1 = time1.values[0]
-            end1 = time1.values[-1]
-            start_end_flag = False
-        if time1.values.min() > start1:
-            start1 = time1.values.min()
-        if time1.values.max() < end1:
-            end1 = time1.values.max()
-
-    for n in ncfiles:
-        # print('accessing file ', ncfiles.index(n) + 1, ' of ', len(ncfiles))
-        if mf_flag:
-            ds = xr.open_mfdataset(
-                n, concat_dim="time", decode_times=False, chunks={"time": 10}
-            )
-            ds["time"] = xr.open_mfdataset(n).time
-        else:
-            ds = xr.open_dataset(n, decode_times=False, chunks={"time": 10})
-            ds["time"] = xr.decode_cf(ds).time
-
-        ds["time"].values = pd.to_datetime(
-            {"year": ds.time.dt.year, "month": ds.time.dt.month, "day": ds.time.dt.day}
-        )
-        ds = ds.where((ds.time >= start1) & (ds.time <= end1), drop=True)
-        ds1.append(ds.drop("time"))
-    # print('concatenating files : adding dimension ', dim, )
+    time_flag, time_all = _ens_checktimes(datasets, mf_flag)
+
+    ds1 = _ens_align_datasets(datasets, mf_flag, time_flag, time_all)
+
    ens = xr.concat(ds1, dim=dim)
-    # assign time coords
-    ens = ens.assign_coords(time=ds.time.values)

    return ens


@@ -201,6 +166,125 @@ def ensemble_percentiles(ens, values=(10, 50, 90), time_block=None):
    return ds_out


def _ens_checktimes(datasets, mf_flag=False):
    """Check a list of datasets and determine whether they have a time dimension. If present, return the
    union of time steps covering all input datasets.
    Parameters
    ----------
    datasets : sequence
      List of netcdf file paths or xr.Dataset objects. If mf_flag is True, `datasets` should be a list of lists where
      each sublist contains input .nc files of a multifile dataset
    mf_flag : Boolean. If True, climate simulations are treated as multifile datasets before concatenation.
      Only applicable when `datasets` is a sequence of file paths
    Returns
    -------
    time_flag : bool; True if a time dimension is present in the dataset list, otherwise False.
    time_all : array of datetime64; series of unique time steps covering all input datasets.
    """

    time_flag = False
    time_all = []
    for n in datasets:
        if mf_flag:
            ds = xr.open_mfdataset(
                n, concat_dim="time", decode_times=False, chunks={"time": 10}
            )
        else:
            if isinstance(n, xr.Dataset):
                ds = n
            else:
                ds = xr.open_dataset(n, decode_times=False)

        if hasattr(ds, "time"):
            ds["time"] = xr.decode_cf(ds).time
            time_flag = True

            # normalize times to daily resolution before comparing datasets
            time1 = pd.to_datetime(
                {
                    "year": ds.time.dt.year,
                    "month": ds.time.dt.month,
                    "day": ds.time.dt.day,
                }
            )

            time_all.extend(time1.values)
    if time_flag:
        time_all = pd.unique(time_all)
        time_all.sort()
    else:
        time_all = None
    return time_flag, time_all
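For illustration, the helper's contract as exercised by the tests (a private function; file names hypothetical):

    import pandas as pd
    from xclim import ensembles

    # members with a time dimension: returns True and the sorted union of time steps
    time_flag, time_all = ensembles._ens_checktimes(["sim1.nc", "sim2.nc"])
    if time_flag:
        print(pd.DatetimeIndex(time_all).min(), pd.DatetimeIndex(time_all).max())
    # members without a time dimension: returns (False, None)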


def _ens_align_datasets(datasets, mf_flag=False, time_flag=False, time_all=None):
    """Create a list of aligned xr.Datasets for ensemble dataset creation. If `time_flag == True`, input datasets
    are given a common time dimension defined by `time_all`. Datasets not covering the entire time span have their
    data padded with `nan` values.
    Parameters
    ----------
    datasets : sequence
      List of netcdf file paths or xr.Dataset objects. If mf_flag is True, `datasets` should be a list of lists where
      each sublist contains input .nc files of a multifile dataset
    mf_flag : Boolean. If True, climate simulations are treated as multifile datasets before concatenation.
      Only applicable when `datasets` is a sequence of file paths
    time_flag : bool; True if a time dimension is present in the dataset list, otherwise False.
    time_all : array of datetime64; series of unique time steps covering all input datasets.
    Returns
    -------
    ds_all : list; list of xr.Datasets
    """

    ds_all = []
    for n in datasets:
        if mf_flag:
            ds = xr.open_mfdataset(
                n, concat_dim="time", decode_times=False, chunks={"time": 10}
            )
        else:
            if isinstance(n, xr.Dataset):
                ds = n
            else:
                ds = xr.open_dataset(n, decode_times=False)

        if time_flag:
            ds["time"] = xr.decode_cf(ds).time
            ds["time"].values = pd.to_datetime(
                {
                    "year": ds.time.dt.year,
                    "month": ds.time.dt.month,
                    "day": ds.time.dt.day,
                }
            )
            # if the dataset does not cover all time steps, pad its data with NaNs
            if ds.time.min() > time_all.min() or ds.time.max() < time_all.max():
                coords = {}
                for c in [c for c in ds.coords if "time" not in c]:
                    coords[c] = ds.coords[c]
                coords["time"] = time_all
                dsTmp = xr.Dataset(data_vars=None, coords=coords, attrs=ds.attrs)
                for v in ds.data_vars:
                    dsTmp[v] = ds[v]
                ds = dsTmp
        ds_all.append(ds)

    return ds_all
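The padding step leans on xarray's assignment alignment: inserting a variable into a dataset whose time coordinate is the full series reindexes it to that coordinate, leaving NaN where the member has no data. A standalone sketch of the same effect with synthetic data:

    import numpy as np
    import pandas as pd
    import xarray as xr

    short = xr.Dataset(
        {"tg_mean": ("time", np.ones(3))},
        coords={"time": pd.date_range("2001-01-01", periods=3, freq="YS")},
    )
    full = xr.Dataset(coords={"time": pd.date_range("2000-01-01", periods=5, freq="YS")})
    full["tg_mean"] = short["tg_mean"]  # aligned to the longer time axis, NaN elsewhere
    assert np.isnan(full.tg_mean.sel(time="2000")).all()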


def _calc_percentiles_simple(ens, v, values):
    ds_out = ens.drop(ens.data_vars)
    dims = list(ens[v].dims)