Commit cacc9a9

Smaller fixes and better tests, merging of private fork (#12)

s-scherrer authored Aug 30, 2022
1 parent b414466 commit cacc9a9

Showing 17 changed files with 298 additions and 126 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.rst
@@ -5,7 +5,17 @@ Changelog
Unreleased
==========

-
v0.1.2
======

- more features for ``DirectoryImageReader``
- handling of "image" files with multiple time steps
- better documentation for subclassing
- renaming of other readers
- ``XarrayImageStackReader`` to ``StackImageReader``
- ``XarrayTSReader`` to ``StackTs``
- ``repurpose`` function for image readers
- improved test coverage

v0.1.0
======
4 changes: 2 additions & 2 deletions src/qa4sm_preprocessing/reading/__init__.py
@@ -1,4 +1,4 @@
from .image import DirectoryImageReader
from .stack import XarrayImageStackReader
from .timeseries import XarrayTSReader, GriddedNcOrthoMultiTs
from .stack import StackImageReader
from .timeseries import StackTs, GriddedNcOrthoMultiTs
from .transpose import write_transposed_dataset
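
As an aside, not part of the diff: a minimal sketch of the renamed public API. The stack file name, variable name, and coordinates are placeholders, and ``StackTs`` is assumed to accept the same (source, variable) arguments as ``StackImageReader`` and to expose the usual pygeobase-style ``read(lon, lat)``.

    from qa4sm_preprocessing.reading import StackImageReader, StackTs

    # wrap an image stack file (formerly XarrayImageStackReader)
    img_reader = StackImageReader("stack.nc", "soil_moisture")

    # wrap the same stack for per-pixel timeseries access
    # (formerly XarrayTSReader)
    ts_reader = StackTs("stack.nc", "soil_moisture")
    series = ts_reader.read(15.0, 48.0)  # lon, lat of a grid point
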
6 changes: 3 additions & 3 deletions src/qa4sm_preprocessing/reading/base.py
@@ -15,7 +15,7 @@
from .utils import mkdate


class XarrayReaderBase:
class ReaderBase:
"""
Base class for readers backed by xarray objects (images, image stacks,
timeseries).
@@ -239,7 +239,7 @@ def finalize_grid(self, grid):
num_gpis = len(grid.activegpis)
logging.debug(f"finalize_grid: Number of active gpis: {num_gpis}")

if hasattr(self, "cellsize"):
if hasattr(self, "cellsize"): # pragma: no branch
if self.cellsize is None:
# Automatically set a suitable cell size, aiming at cell sizes
# of about 30**2 pixels.
@@ -249,7 +249,7 @@ def finalize_grid(self, grid):
grid = grid.to_cell_grid(cellsize=self.cellsize)
num_cells = len(grid.get_cells())
logging.debug(
f"_grid_from_xarray: Number of grid cells: {num_cells}"
f"finalize_grid: Number of grid cells: {num_cells}"
)

return grid
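
As an aside, not part of the diff: the comment in this hunk aims at cells of about 30**2 pixels. A hedged sketch of one way such a cell size could be derived from a regular grid spacing; this is an illustration only, not the heuristic ``finalize_grid`` actually uses.

    def suggest_cellsize(grid_spacing_deg: float, pixels_per_side: int = 30) -> float:
        # a cell covering pixels_per_side x pixels_per_side points of a
        # regular grid spans pixels_per_side * spacing degrees per side
        return pixels_per_side * grid_spacing_deg

    suggest_cellsize(0.25)  # -> 7.5 degrees
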
49 changes: 13 additions & 36 deletions src/qa4sm_preprocessing/reading/cli.py
@@ -31,7 +31,7 @@

from repurpose.img2ts import Img2Ts

from . import XarrayImageStackReader, DirectoryImageReader
from . import StackImageReader, DirectoryImageReader
from .transpose import write_transposed_dataset
from .utils import mkdate, str2bool

@@ -211,40 +211,24 @@ def __init__(self, description):
default=True,
help="Whether to use compression or not. Default is true",
)
self.add_argument(
"--memory",
type=float,
default=2,
help="The amount of memory to use as buffer in GB",
)


class RepurposeArgumentParser(ReaderArgumentParser):
def __init__(self):
super().__init__("Converts data to time series format.")
self.prog = "repurpose_images"
self.add_argument(
"--imgbuffer",
type=int,
default=365,
help=(
"How many images to read at once. Bigger "
"numbers make the conversion faster but "
"consume more memory. Default is 365."
),
)
self.add_argument(
"--cellsize",
type=float,
default=5.0,
help=("Size of single file cells. Default is 5.0."),
)


class TransposeArgumentParser(ReaderArgumentParser):
def __init__(self):
super().__init__("Converts data to transposed netCDF.")
self.prog = "transpose_images"
self.add_argument(
"--memory",
type=float,
default=2,
help="The amount of memory to use as buffer in GB",
)
self.add_argument(
"--n_threads",
type=int,
@@ -300,7 +284,7 @@ def parse_args(parser, args):

input_path = Path(args.dataset_root)
if input_path.is_file():
reader = XarrayImageStackReader(
reader = StackImageReader(
input_path,
args.parameter,
**common_reader_kwargs,
@@ -327,19 +311,12 @@ def repurpose(args):
outpath = Path(args.output_root)
outpath.mkdir(exist_ok=True, parents=True)

reshuffler = Img2Ts(
input_dataset=reader,
outputpath=args.output_root,
startdate=args.start,
enddate=args.end,
ts_attributes=reader.global_attrs,
zlib=args.zlib,
imgbuffer=args.imgbuffer,
# this is necessary currently due to bug in repurpose
cellsize_lat=args.cellsize,
cellsize_lon=args.cellsize,
reader.repurpose(
args.output_root,
start=args.start,
end=args.end,
memory=args.memory
)
reshuffler.calc()


def transpose(args):
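
As an aside, not part of the diff: a hedged sketch of the conversion flow that the CLI ``repurpose`` function now delegates to. The ``repurpose`` keyword arguments match the call in the hunk above; the ``DirectoryImageReader`` constructor arguments, paths, and dates are placeholders.

    from qa4sm_preprocessing.reading import DirectoryImageReader

    # constructor arguments are assumed to be (directory, variable name),
    # analogous to StackImageReader
    reader = DirectoryImageReader("/data/images", "soil_moisture")

    # writes cell-based timeseries files and returns a timeseries reader;
    # start/end may be ISO date strings (parsed via mkdate), memory is the
    # read buffer size in GB (the --memory option)
    ts_reader = reader.repurpose(
        "/data/timeseries",
        start="2020-01-01",
        end="2020-12-31",
        memory=2,
    )
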
26 changes: 13 additions & 13 deletions src/qa4sm_preprocessing/reading/image.py
@@ -5,7 +5,7 @@
The ``DirectoryImageReader`` aims to provide an easy to use class to read
directories of single images, to either create a single image stack file, a
transposed stack (via ``write_transposed_dataset``), or a cell-based timeseries
dataset (via the repurpose package).
dataset (via the ``repurpose`` method).
The advantage over ``xr.open_mfdataset`` is that the reader typically does not
need to open each dataset to get information on coordinates. Instead, the
@@ -35,15 +35,16 @@
* ``_metadata_from_dataset``: If metadata cannot be read from the files.
* ``_tstamps_in_file``: If the timestamp cannot be inferred from the filename
but via other info specific to the dataset this can be used to avoid having
to reading all files only to get the timestamps.
to read all files only to get the timestamps.
* ``_landmask_from_dataset``: If a landmask is required (only if the ``read``
function is used), and it cannot be read with ``_open_dataset`` and also not
with other options. Should not be necessary very often.
* ``_read_single_file``: normally this calls ``_open_dataset`` and then returns
the data as dictionary that maps from variable names to 3d data arrays (numpy
or dask). If it is hard to read the data as xr.Dataset, so that overriding
`_open_dataset` is not feasible, this could be overriden instead, but then
all the other routines for obtaining metadata also have to be overriden.
all the other routines for obtaining grid/metadata/landmask info also have to
be overriden.
In the following some examples for subclassing are provided.
@@ -87,7 +88,7 @@ def __init__(self, directory):
It is often necessary to preprocess the data before using it. For example, many
datasets contain quality flags as additional variable that need to be applied
to mask out unreliable data. Another example would be a case where one is
interested in a sum of multiple variables. In this case it is necessary to
interested in a sum of multiple variables. In these cases it is necessary to
override the ``_open_dataset`` method.
As an example, consider that we have images containing a field "soil_moisture"
@@ -106,9 +107,9 @@ def __init__(self, directory):
)
def _open_dataset(self, fname):
ds = super()_open_dataset(fname)
ds = super()._open_dataset(fname)
qc = ds["quality_flag"]
# We check if the first bit is zero by doing a bitwise and with 1.
# We check if the first bit is zero by doing a bitwise AND with 1.
# The result is 0 if the first bit is zero, and 1 otherwise.
valid = (qc & 2**0) == 0
return ds[["soil_moisture"]].where(valid)
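
As an aside, not part of the diff: a hedged sketch extending the flag check above to several bits. The bit positions and their meanings are made up for illustration.

    from qa4sm_preprocessing.reading import DirectoryImageReader

    class MaskedReader(DirectoryImageReader):

        def _open_dataset(self, fname):
            ds = super()._open_dataset(fname)
            qc = ds["quality_flag"]
            # reject pixels where bit 0 (e.g. "no retrieval") or bit 2
            # (e.g. "frozen ground") is set: combine both bits into a mask
            # and keep only pixels where none of them are set
            bad_bits = (1 << 0) | (1 << 2)
            valid = (qc & bad_bits) == 0
            return ds[["soil_moisture"]].where(valid)
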
@@ -151,8 +152,7 @@ def __init__(self, directory):
fmt="%Y%m%d",
time_regex_pattern=r"SMAP_L3_SM_P_([0-9]+)_R.*.h5",
pattern="**/*.h5",
use_tqdm=True, # for nice progress bar
#
# there are 2 timestamps in each file
timestamps=[pd.Timedelta("6H"), pd.Timedelta("18H")]
)
@@ -198,7 +198,7 @@ def _latlon_from_dataset(self, fname):
don't contain timestamps, it might be necessary to also override
`_tstamps_in_file`, `_metadata_from_dataset`, or
`_landmask_from_dataset`. Since these are edge cases, they are not shown in
detail here, but it works similarly to the other examples.
detail here, but it works similar to the other examples.
"""

import cftime
@@ -214,12 +214,12 @@ def _latlon_from_dataset(self, fname):
import warnings
import xarray as xr

from .imagebase import XarrayImageReaderBase
from .imagebase import ImageReaderBase
from .base import LevelSelectionMixin
from .exceptions import ReaderError


class DirectoryImageReader(LevelSelectionMixin, XarrayImageReaderBase):
class DirectoryImageReader(LevelSelectionMixin, ImageReaderBase):
r"""
Image reader for a directory containing netcdf files.
@@ -637,7 +637,7 @@ def _read_block(
# If we have to average multiple images to a single image, we will
# read image by image
block_dict = {varname: [] for varname in self.varnames}
if self.use_tqdm:
if self.use_tqdm: # pragma: no branch
times = tqdm(times)
for tstamp in times:
# read all sub-images that have to be averaged later on
@@ -696,7 +696,7 @@ def _read_all_files(self, times, use_tqdm):
# now we can open each file and extract the timestamps we need
block_dict = {varname: [] for varname in self.varnames}
iterator = file_tstamp_map.items()
if use_tqdm:
if use_tqdm: # pragma: no branch
iterator = tqdm(iterator)
for fname, tstamps in iterator:
_blockdict = self._read_single_file(fname, tstamps)
19 changes: 10 additions & 9 deletions src/qa4sm_preprocessing/reading/imagebase.py
@@ -3,6 +3,7 @@
import datetime
import numpy as np
from pathlib import Path
import shutil
from typing import Union, Iterable, List, Tuple, Sequence, Dict
import xarray as xr

@@ -11,11 +12,11 @@

from .exceptions import ReaderError
from .utils import mkdate, nimages_for_memory
from .base import XarrayReaderBase
from .base import ReaderBase
from .timeseries import GriddedNcOrthoMultiTs


class XarrayImageReaderBase(XarrayReaderBase):
class ImageReaderBase(ReaderBase):
"""
Base class for image readers backed by xarray objects (multiple single
images or single stack of multiple images).
@@ -38,11 +39,11 @@ def _validate_start_end(
) -> Tuple[datetime.datetime]:
if start is None:
start = self.timestamps[0]
elif isinstance(start, str):
elif isinstance(start, str): # pragma: no cover
start = mkdate(start)
if end is None:
end = self.timestamps[-1]
elif isinstance(end, str):
elif isinstance(end, str): # pragma: no cover
end = mkdate(end)
return start, end

@@ -206,7 +207,7 @@ def read(
------
KeyError
"""
if isinstance(timestamp, str):
if isinstance(timestamp, str): # pragma: no cover
timestamp = mkdate(timestamp)

if timestamp not in self.timestamps: # pragma: no cover
@@ -268,13 +269,13 @@ def repurpose(
"""
outpath = Path(outpath)
start, end = self._validate_start_end(start, end)
if outpath.exists() and overwrite:
if (outpath / "grid.nc").exists() and overwrite:
shutil.rmtree(outpath)
if not outpath.exists(): # if overwrite=True, it was deleted now
if not (outpath / "grid.nc").exists(): # if overwrite=True, it was deleted now
outpath.mkdir(exist_ok=True, parents=True)
testimg = self._testimg()
n = nimages_for_memory(testimg, memory)
if hasattr(self, "use_tqdm"):
if hasattr(self, "use_tqdm"): # pragma: no branch
orig_tqdm = self.use_tqdm
self.use_tqdm = False
reshuffler = Img2Ts(
@@ -290,7 +291,7 @@
imgbuffer=n,
)
reshuffler.calc()
if hasattr(self, "use_tqdm"):
if hasattr(self, "use_tqdm"): # pragma: no branch
self.use_tqdm = orig_tqdm
reader = GriddedNcOrthoMultiTs(str(outpath), timevarname=timevarname, read_bulk=True)
return reader
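
As an aside, not part of the diff: a hedged sketch of how the ``grid.nc`` check above plays out when ``repurpose`` is called more than once. The source path, output path, and variable name are placeholders, ``overwrite`` is assumed to default to False, and the returned ``GriddedNcOrthoMultiTs`` is assumed to follow the usual pynetcf ``read(lon, lat)`` interface.

    from qa4sm_preprocessing.reading import StackImageReader

    reader = StackImageReader("stack.nc", "soil_moisture")

    # first call: converts the stack to cell-based timeseries files and
    # returns a reader on them (opened with read_bulk=True)
    ts_reader = reader.repurpose("/data/timeseries", memory=4)

    # second call: grid.nc already exists, so the conversion is skipped and
    # only a reader on the existing files is returned
    ts_reader = reader.repurpose("/data/timeseries", memory=4)

    # with overwrite=True the output directory is removed and rebuilt
    ts_reader = reader.repurpose("/data/timeseries", memory=4, overwrite=True)

    # read the timeseries of the grid point nearest to the given lon/lat
    df = ts_reader.read(15.5, 48.2)
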
4 changes: 2 additions & 2 deletions src/qa4sm_preprocessing/reading/stack.py
@@ -5,10 +5,10 @@
from typing import Iterable, Union
import xarray as xr

from .imagebase import XarrayImageReaderBase
from .imagebase import ImageReaderBase


class XarrayImageStackReader(XarrayImageReaderBase):
class StackImageReader(ImageReaderBase):
"""
Image reader that wraps a xarray.Dataset.
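
As an aside, not part of the diff: a minimal sketch of wrapping an in-memory ``xarray.Dataset``, which is what ``StackImageReader`` is documented to do. The coordinate names and sizes are made up, and ``read`` is assumed to take one of the stack's timestamps.

    import numpy as np
    import pandas as pd
    import xarray as xr

    from qa4sm_preprocessing.reading import StackImageReader

    times = pd.date_range("2020-01-01", periods=3, freq="D")
    ds = xr.Dataset(
        {"soil_moisture": (("time", "lat", "lon"), np.random.rand(3, 4, 5))},
        coords={
            "time": times,
            "lat": np.linspace(45.0, 48.0, 4),
            "lon": np.linspace(10.0, 14.0, 5),
        },
    )

    reader = StackImageReader(ds, "soil_moisture")
    img = reader.read(times[0])  # image at the first timestamp
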