Commit cacc9a9

Smaller fixes and better tests, merging of private fork (#12)

s-scherrer authored Aug 30, 2022
1 parent b414466 commit cacc9a9

Showing 17 changed files with 298 additions and 126 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.rst
@@ -5,7 +5,17 @@ Changelog
Unreleased
==========

-
v0.1.2
======

- more features for ``DirectoryImageReader``
- handling of "image" files with multiple time steps
- better documentation for subclassing
- renaming of other readers
- ``XarrayImageStackReader`` to ``StackImageReader``
- ``XarrayTSReader`` to ``StackTs``
- ``repurpose`` function for image readers
- improved test coverage

v0.1.0
======
4 changes: 2 additions & 2 deletions src/qa4sm_preprocessing/reading/__init__.py
@@ -1,4 +1,4 @@
from .image import DirectoryImageReader
from .stack import XarrayImageStackReader
from .timeseries import XarrayTSReader, GriddedNcOrthoMultiTs
from .stack import StackImageReader
from .timeseries import StackTs, GriddedNcOrthoMultiTs
from .transpose import write_transposed_dataset
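
As an aside, not part of the diff: a minimal sketch of the renamed public API. The stack file name, variable name, and coordinates are placeholders, and ``StackTs`` is assumed to accept the same (source, variable) arguments as ``StackImageReader`` and to expose the usual pygeobase-style ``read(lon, lat)``.

    from qa4sm_preprocessing.reading import StackImageReader, StackTs

    # wrap an image stack file (formerly XarrayImageStackReader)
    img_reader = StackImageReader("stack.nc", "soil_moisture")

    # wrap the same stack for per-pixel timeseries access
    # (formerly XarrayTSReader)
    ts_reader = StackTs("stack.nc", "soil_moisture")
    series = ts_reader.read(15.0, 48.0)  # lon, lat of a grid point
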
6 changes: 3 additions & 3 deletions src/qa4sm_preprocessing/reading/base.py
@@ -15,7 +15,7 @@
from .utils import mkdate


class XarrayReaderBase:
class ReaderBase:
"""
Base class for readers backed by xarray objects (images, image stacks,
timeseries).
@@ -239,7 +239,7 @@ def finalize_grid(self, grid):
num_gpis = len(grid.activegpis)
logging.debug(f"finalize_grid: Number of active gpis: {num_gpis}")

if hasattr(self, "cellsize"):
if hasattr(self, "cellsize"): # pragma: no branch
if self.cellsize is None:
# Automatically set a suitable cell size, aiming at cell sizes
# of about 30**2 pixels.
@@ -249,7 +249,7 @@ def finalize_grid(self, grid):
grid = grid.to_cell_grid(cellsize=self.cellsize)
num_cells = len(grid.get_cells())
logging.debug(
f"_grid_from_xarray: Number of grid cells: {num_cells}"
f"finalize_grid: Number of grid cells: {num_cells}"
)

return grid
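
As an aside, not part of the diff: the comment in this hunk aims at cells of about 30**2 pixels. A hedged sketch of one way such a cell size could be derived from a regular grid spacing; this is an illustration only, not the heuristic ``finalize_grid`` actually uses.

    def suggest_cellsize(grid_spacing_deg: float, pixels_per_side: int = 30) -> float:
        # a cell covering pixels_per_side x pixels_per_side points of a
        # regular grid spans pixels_per_side * spacing degrees per side
        return pixels_per_side * grid_spacing_deg

    suggest_cellsize(0.25)  # -> 7.5 degrees
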
49 changes: 13 additions & 36 deletions src/qa4sm_preprocessing/reading/cli.py
@@ -31,7 +31,7 @@

from repurpose.img2ts import Img2Ts

from . import XarrayImageStackReader, DirectoryImageReader
from . import StackImageReader, DirectoryImageReader
from .transpose import write_transposed_dataset
from .utils import mkdate, str2bool

@@ -211,40 +211,24 @@ def __init__(self, description):
default=True,
help="Whether to use compression or not. Default is true",
)
self.add_argument(
"--memory",
type=float,
default=2,
help="The amount of memory to use as buffer in GB",
)


class RepurposeArgumentParser(ReaderArgumentParser):
def __init__(self):
super().__init__("Converts data to time series format.")
self.prog = "repurpose_images"
self.add_argument(
"--imgbuffer",
type=int,
default=365,
help=(
"How many images to read at once. Bigger "
"numbers make the conversion faster but "
"consume more memory. Default is 365."
),
)
self.add_argument(
"--cellsize",
type=float,
default=5.0,
help=("Size of single file cells. Default is 5.0."),
)


class TransposeArgumentParser(ReaderArgumentParser):
def __init__(self):
super().__init__("Converts data to transposed netCDF.")
self.prog = "transpose_images"
self.add_argument(
"--memory",
type=float,
default=2,
help="The amount of memory to use as buffer in GB",
)
self.add_argument(
"--n_threads",
type=int,
@@ -300,7 +284,7 @@ def parse_args(parser, args):

input_path = Path(args.dataset_root)
if input_path.is_file():
reader = XarrayImageStackReader(
reader = StackImageReader(
input_path,
args.parameter,
**common_reader_kwargs,
@@ -327,19 +311,12 @@ def repurpose(args):
outpath = Path(args.output_root)
outpath.mkdir(exist_ok=True, parents=True)

reshuffler = Img2Ts(
input_dataset=reader,
outputpath=args.output_root,
startdate=args.start,
enddate=args.end,
ts_attributes=reader.global_attrs,
zlib=args.zlib,
imgbuffer=args.imgbuffer,
# this is necessary currently due to bug in repurpose
cellsize_lat=args.cellsize,
cellsize_lon=args.cellsize,
reader.repurpose(
args.output_root,
start=args.start,
end=args.end,
memory=args.memory
)
reshuffler.calc()


def transpose(args):
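
As an aside, not part of the diff: a hedged sketch of the conversion flow that the CLI ``repurpose`` function now delegates to. The ``repurpose`` keyword arguments match the call in the hunk above; the ``DirectoryImageReader`` constructor arguments, paths, and dates are placeholders.

    from qa4sm_preprocessing.reading import DirectoryImageReader

    # constructor arguments are assumed to be (directory, variable name),
    # analogous to StackImageReader
    reader = DirectoryImageReader("/data/images", "soil_moisture")

    # writes cell-based timeseries files and returns a timeseries reader;
    # start/end may be ISO date strings (parsed via mkdate), memory is the
    # read buffer size in GB (the --memory option)
    ts_reader = reader.repurpose(
        "/data/timeseries",
        start="2020-01-01",
        end="2020-12-31",
        memory=2,
    )
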
26 changes: 13 additions & 13 deletions src/qa4sm_preprocessing/reading/image.py
@@ -5,7 +5,7 @@
The ``DirectoryImageReader`` aims to provide an easy to use class to read
directories of single images, to either create a single image stack file, a
transposed stack (via ``write_transposed_dataset``), or a cell-based timeseries
dataset (via the repurpose package).
dataset (via the ``repurpose`` method).
The advantage over ``xr.open_mfdataset`` is that the reader typically does not
need to open each dataset to get information on coordinates. Instead, the
@@ -35,15 +35,16 @@
* ``_metadata_from_dataset``: If metadata cannot be read from the files.
* ``_tstamps_in_file``: If the timestamp cannot be inferred from the filename
but via other info specific to the dataset this can be used to avoid having
to reading all files only to get the timestamps.
to read all files only to get the timestamps.
* ``_landmask_from_dataset``: If a landmask is required (only if the ``read``
function is used), and it cannot be read with ``_open_dataset`` and also not
with other options. Should not be necessary very often.
* ``_read_single_file``: normally this calls ``_open_dataset`` and then returns
the data as dictionary that maps from variable names to 3d data arrays (numpy
or dask). If it is hard to read the data as xr.Dataset, so that overriding
`_open_dataset` is not feasible, this could be overriden instead, but then
all the other routines for obtaining metadata also have to be overriden.
all the other routines for obtaining grid/metadata/landmask info also have to
be overriden.
In the following some examples for subclassing are provided.
@@ -87,7 +88,7 @@ def __init__(self, directory):
It is often necessary to preprocess the data before using it. For example, many
datasets contain quality flags as additional variable that need to be applied
to mask out unreliable data. Another example would be a case where one is
interested in a sum of multiple variables. In this case it is necessary to
interested in a sum of multiple variables. In these cases it is necessary to
override the ``_open_dataset`` method.
As an example, consider that we have images containing a field "soil_moisture"
@@ -106,9 +107,9 @@ def __init__(self, directory):
)
def _open_dataset(self, fname):
ds = super()_open_dataset(fname)
ds = super()._open_dataset(fname)
qc = ds["quality_flag"]
# We check if the first bit is zero by doing a bitwise and with 1.
# We check if the first bit is zero by doing a bitwise AND with 1.
# The result is 0 if the first bit is zero, and 1 otherwise.
valid = (qc & 2**0) == 0
return ds[["soil_moisture"]].where(valid)
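
As an aside, not part of the diff: a hedged sketch extending the flag check above to several bits. The bit positions and their meanings are made up for illustration.

    from qa4sm_preprocessing.reading import DirectoryImageReader

    class MaskedReader(DirectoryImageReader):

        def _open_dataset(self, fname):
            ds = super()._open_dataset(fname)
            qc = ds["quality_flag"]
            # reject pixels where bit 0 (e.g. "no retrieval") or bit 2
            # (e.g. "frozen ground") is set: combine both bits into a mask
            # and keep only pixels where none of them are set
            bad_bits = (1 << 0) | (1 << 2)
            valid = (qc & bad_bits) == 0
            return ds[["soil_moisture"]].where(valid)
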
@@ -151,8 +152,7 @@ def __init__(self, directory):
fmt="%Y%m%d",
time_regex_pattern=r"SMAP_L3_SM_P_([0-9]+)_R.*.h5",
pattern="**/*.h5",
use_tqdm=True, # for nice progress bar
#
# there are 2 timestamps in each file
timestamps=[pd.Timedelta("6H"), pd.Timedelta("18H")]
)
@@ -198,7 +198,7 @@ def _latlon_from_dataset(self, fname):
don't contain timestamps, it might be necessary to also override
`_tstamps_in_file`, `_metadata_from_dataset`, or
`_landmask_from_dataset`. Since these are edge cases, they are not shown in
detail here, but it works similarly to the other examples.
detail here, but it works similar to the other examples.
"""

import cftime
@@ -214,12 +214,12 @@ def _latlon_from_dataset(self, fname):
import warnings
import xarray as xr

from .imagebase import XarrayImageReaderBase
from .imagebase import ImageReaderBase
from .base import LevelSelectionMixin
from .exceptions import ReaderError


class DirectoryImageReader(LevelSelectionMixin, XarrayImageReaderBase):
class DirectoryImageReader(LevelSelectionMixin, ImageReaderBase):
r"""
Image reader for a directory containing netcdf files.
@@ -637,7 +637,7 @@ def _read_block(
# If we have to average multiple images to a single image, we will
# read image by image
block_dict = {varname: [] for varname in self.varnames}
if self.use_tqdm:
if self.use_tqdm: # pragma: no branch
times = tqdm(times)
for tstamp in times:
# read all sub-images that have to be averaged later on
@@ -696,7 +696,7 @@ def _read_all_files(self, times, use_tqdm):
# now we can open each file and extract the timestamps we need
block_dict = {varname: [] for varname in self.varnames}
iterator = file_tstamp_map.items()
if use_tqdm:
if use_tqdm: # pragma: no branch
iterator = tqdm(iterator)
for fname, tstamps in iterator:
_blockdict = self._read_single_file(fname, tstamps)
19 changes: 10 additions & 9 deletions src/qa4sm_preprocessing/reading/imagebase.py
@@ -3,6 +3,7 @@
import datetime
import numpy as np
from pathlib import Path
import shutil
from typing import Union, Iterable, List, Tuple, Sequence, Dict
import xarray as xr

@@ -11,11 +12,11 @@

from .exceptions import ReaderError
from .utils import mkdate, nimages_for_memory
from .base import XarrayReaderBase
from .base import ReaderBase
from .timeseries import GriddedNcOrthoMultiTs


class XarrayImageReaderBase(XarrayReaderBase):
class ImageReaderBase(ReaderBase):
"""
Base class for image readers backed by xarray objects (multiple single
images or single stack of multiple images).
@@ -38,11 +39,11 @@ def _validate_start_end(
) -> Tuple[datetime.datetime]:
if start is None:
start = self.timestamps[0]
elif isinstance(start, str):
elif isinstance(start, str): # pragma: no cover
start = mkdate(start)
if end is None:
end = self.timestamps[-1]
elif isinstance(end, str):
elif isinstance(end, str): # pragma: no cover
end = mkdate(end)
return start, end

@@ -206,7 +207,7 @@ def read(
------
KeyError
"""
if isinstance(timestamp, str):
if isinstance(timestamp, str): # pragma: no cover
timestamp = mkdate(timestamp)

if timestamp not in self.timestamps: # pragma: no cover
@@ -268,13 +269,13 @@ def repurpose(
"""
outpath = Path(outpath)
start, end = self._validate_start_end(start, end)
if outpath.exists() and overwrite:
if (outpath / "grid.nc").exists() and overwrite:
shutil.rmtree(outpath)
if not outpath.exists(): # if overwrite=True, it was deleted now
if not (outpath / "grid.nc").exists(): # if overwrite=True, it was deleted now
outpath.mkdir(exist_ok=True, parents=True)
testimg = self._testimg()
n = nimages_for_memory(testimg, memory)
if hasattr(self, "use_tqdm"):
if hasattr(self, "use_tqdm"): # pragma: no branch
orig_tqdm = self.use_tqdm
self.use_tqdm = False
reshuffler = Img2Ts(
@@ -290,7 +291,7 @@
imgbuffer=n,
)
reshuffler.calc()
if hasattr(self, "use_tqdm"):
if hasattr(self, "use_tqdm"): # pragma: no branch
self.use_tqdm = orig_tqdm
reader = GriddedNcOrthoMultiTs(str(outpath), timevarname=timevarname, read_bulk=True)
return reader
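
As an aside, not part of the diff: a hedged sketch of how the ``grid.nc`` check above plays out when ``repurpose`` is called more than once. The source path, output path, and variable name are placeholders, ``overwrite`` is assumed to default to False, and the returned ``GriddedNcOrthoMultiTs`` is assumed to follow the usual pynetcf ``read(lon, lat)`` interface.

    from qa4sm_preprocessing.reading import StackImageReader

    reader = StackImageReader("stack.nc", "soil_moisture")

    # first call: converts the stack to cell-based timeseries files and
    # returns a reader on them (opened with read_bulk=True)
    ts_reader = reader.repurpose("/data/timeseries", memory=4)

    # second call: grid.nc already exists, so the conversion is skipped and
    # only a reader on the existing files is returned
    ts_reader = reader.repurpose("/data/timeseries", memory=4)

    # with overwrite=True the output directory is removed and rebuilt
    ts_reader = reader.repurpose("/data/timeseries", memory=4, overwrite=True)

    # read the timeseries of the grid point nearest to the given lon/lat
    df = ts_reader.read(15.5, 48.2)
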
4 changes: 2 additions & 2 deletions src/qa4sm_preprocessing/reading/stack.py
@@ -5,10 +5,10 @@
from typing import Iterable, Union
import xarray as xr

from .imagebase import XarrayImageReaderBase
from .imagebase import ImageReaderBase


class XarrayImageStackReader(XarrayImageReaderBase):
class StackImageReader(ImageReaderBase):
"""
Image reader that wraps a xarray.Dataset.
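
As an aside, not part of the diff: a minimal sketch of wrapping an in-memory ``xarray.Dataset``, which is what ``StackImageReader`` is documented to do. The coordinate names and sizes are made up, and ``read`` is assumed to take one of the stack's timestamps.

    import numpy as np
    import pandas as pd
    import xarray as xr

    from qa4sm_preprocessing.reading import StackImageReader

    times = pd.date_range("2020-01-01", periods=3, freq="D")
    ds = xr.Dataset(
        {"soil_moisture": (("time", "lat", "lon"), np.random.rand(3, 4, 5))},
        coords={
            "time": times,
            "lat": np.linspace(45.0, 48.0, 4),
            "lon": np.linspace(10.0, 14.0, 5),
        },
    )

    reader = StackImageReader(ds, "soil_moisture")
    img = reader.read(times[0])  # image at the first timestamp
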