cf-coding #7654

Closed
wants to merge 22 commits

Commits (22)
d19d57a
implement coders, adapt tests
kmuehlbauer Apr 1, 2023
2e750e4
preserve boolean-dtype within encoding, adapt test
kmuehlbauer Mar 31, 2023
0cd301b
determine cf packed data dtype, adapt tests
kmuehlbauer Mar 31, 2023
7cfc6ac
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 1, 2023
bf4371d
fix typing, fix if-clauses, add link to issue
kmuehlbauer Apr 1, 2023
fe006df
add comment as per review
kmuehlbauer Apr 1, 2023
66628fc
choose float64 if add_offset is not up to cf expectations
kmuehlbauer Apr 1, 2023
b6310e4
use scale_factor/add_offset in tests as specified by cf conventions
kmuehlbauer Apr 1, 2023
06ec16a
add test which checks add_offset being not conforming to cf standards
kmuehlbauer Apr 1, 2023
19ef234
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 1, 2023
e877aa7
convert to float32 to keep #1840 in sync
kmuehlbauer Apr 1, 2023
8cc7319
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 1, 2023
6a73653
add more comments, add more typing
kmuehlbauer Apr 2, 2023
d6887cd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 2, 2023
c8c3f14
add additional test, make _choose_float_dtype more explicit
kmuehlbauer Apr 2, 2023
2f5709a
Apply suggestions from code review
kmuehlbauer Apr 2, 2023
f215d01
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 2, 2023
c5cd53d
Apply suggestions from code review
kmuehlbauer Apr 2, 2023
986ffc6
Test against None, as 0 (False) is a valid input for add_offset. Chec…
kmuehlbauer Apr 3, 2023
51f7515
apply astype in the right order to prevent loss of precision, more fi…
kmuehlbauer Apr 4, 2023
8c074f0
add mask/scale roundtrip test for variable cf encoding/decoding
kmuehlbauer Apr 4, 2023
031cac5
separate CFMaskCoder test for multiple missing_values/__FillValues co…
kmuehlbauer Apr 4, 2023
312 changes: 263 additions & 49 deletions xarray/coding/variables.py
@@ -78,6 +78,71 @@ def __repr__(self) -> str:
)


class NativeEndiannessArray(indexing.ExplicitlyIndexedNDArrayMixin):
"""Decode arrays on the fly from non-native to native endianness

This is useful for decoding arrays from netCDF3 files (which are all
big endian) into native endianness, so they can be used with Cython
functions, such as those found in bottleneck and pandas.

>>> x = np.arange(5, dtype=">i2")

>>> x.dtype
dtype('>i2')

>>> NativeEndiannessArray(x).dtype
dtype('int16')

>>> indexer = indexing.BasicIndexer((slice(None),))
>>> NativeEndiannessArray(x)[indexer].dtype
dtype('int16')
"""

__slots__ = ("array",)

def __init__(self, array) -> None:
self.array = indexing.as_indexable(array)

@property
def dtype(self) -> np.dtype:
return np.dtype(self.array.dtype.kind + str(self.array.dtype.itemsize))

def __getitem__(self, key) -> np.ndarray:
return np.asarray(self.array[key], dtype=self.dtype)


class BoolTypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
"""Decode arrays on the fly from integer to boolean datatype

This is useful for decoding boolean arrays from integer typed netCDF
variables.

>>> x = np.array([1, 0, 1, 1, 0], dtype="i1")

>>> x.dtype
dtype('int8')

>>> BoolTypeArray(x).dtype
dtype('bool')

>>> indexer = indexing.BasicIndexer((slice(None),))
>>> BoolTypeArray(x)[indexer].dtype
dtype('bool')
"""

__slots__ = ("array",)

def __init__(self, array) -> None:
self.array = indexing.as_indexable(array)

@property
def dtype(self) -> np.dtype:
return np.dtype("bool")

def __getitem__(self, key) -> np.ndarray:
return np.asarray(self.array[key], dtype=self.dtype)


def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike):
"""Lazily apply an element-wise function to an array.
Parameters
@@ -155,34 +220,39 @@ class CFMaskCoder(VariableCoder):
def encode(self, variable: Variable, name: T_Name = None):
dims, data, attrs, encoding = unpack_for_encoding(variable)

# get dtype from encoding if available, otherwise use data.dtype
dtype = np.dtype(encoding.get("dtype", data.dtype))
fv = encoding.get("_FillValue")
mv = encoding.get("missing_value")

if (
fv is not None
and mv is not None
and not duck_array_ops.allclose_or_equiv(fv, mv)
):
raise ValueError(
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
)

if fv is not None:
# Ensure _FillValue is cast to same dtype as data's
encoding["_FillValue"] = dtype.type(fv)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
if not pd.isnull(fill_value):
data = duck_array_ops.fillna(data, fill_value)

if mv is not None:
# Ensure missing_value is cast to same dtype as data's
encoding["missing_value"] = dtype.type(mv)
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
if not pd.isnull(fill_value) and fv is None:
data = duck_array_ops.fillna(data, fill_value)
if fv is not None or mv is not None:
if (
fv is not None
and mv is not None
and not duck_array_ops.allclose_or_equiv(fv, mv)
):
raise ValueError(
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
)

return Variable(dims, data, attrs, encoding, fastpath=True)
if fv is not None:
# Ensure _FillValue is cast to same dtype as data's
encoding["_FillValue"] = dtype.type(fv)
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
if not pd.isnull(fill_value):
data = duck_array_ops.fillna(data, fill_value)

if mv is not None:
# Use _FillValue, if available, to align missing_value with it
# and prevent issues when decoding
# Ensure missing_value is cast to same dtype as data's
encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv))
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
if not pd.isnull(fill_value) and fv is None:
data = duck_array_ops.fillna(data, fill_value)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable
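
A minimal round-trip sketch for CFMaskCoder (illustrative values, not part of the diff; assumes this branch): NaNs are filled with the declared _FillValue on encode, and decode (shown below) masks them back.

import numpy as np
import xarray as xr
from xarray.coding.variables import CFMaskCoder

# illustrative variable with a NaN and an explicit _FillValue in its encoding
var = xr.Variable(("x",), np.array([1.0, np.nan, 3.0]),
                  encoding={"_FillValue": -9999.0})
coder = CFMaskCoder()
encoded = coder.encode(var)      # NaN replaced by -9999.0, _FillValue moved to attrs
decoded = coder.decode(encoded)  # -9999.0 masked back to NaN on access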

def decode(self, variable: Variable, name: T_Name = None):
dims, data, attrs, encoding = unpack_for_decoding(variable)
@@ -207,7 +277,13 @@ def decode(self, variable: Variable, name: T_Name = None):
stacklevel=3,
)

dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
if "scale_factor" not in attrs and "add_offset" not in attrs:
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
else:
dtype, decoded_fill_value = (
_choose_float_dtype(data.dtype, attrs),
np.nan,
)

if encoded_fill_values:
transform = partial(
Expand All @@ -232,20 +308,50 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp
return data


def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]:
"""Return a float dtype that can losslessly represent `dtype` values."""
# Keep float32 as-is. Upcast half-precision to single-precision,
[Review thread attached to def _choose_float_dtype]

Contributor: @mankoff do you have time to take a look here please?

Contributor: Hi @dcherian. I'm not sure what to look for. I will link to my open-but-stale PR that tried to start addressing this/similar issues: #6812

Perhaps also relevant are my last few comments on #2304 (see #2304 (comment)). The problem for me is that (1) the CF standards are ambiguously defined and (2) xarray needs to address the many use-cases where the CF standards are not followed (usually this means different data types).

Contributor: Thanks! Does this PR fix your original issue?

Contributor Author: Thanks @mankoff, I'll have a look at your extensive notes over there and try to come up with something.

def _choose_float_dtype(
dtype: np.dtype, mapping: MutableMapping
) -> type[np.floating[Any]]:
# check scale/offset first to derive wanted float dtype
# see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954
scale_factor = mapping.get("scale_factor")
add_offset = mapping.get("add_offset")
if scale_factor is not None or add_offset is not None:
# get the type from scale_factor/add_offset to determine
# the needed floating point type
if scale_factor is not None:
scale_type = type(scale_factor)
if add_offset is not None:
offset_type = type(add_offset)
# CF conforming, both scale_factor and add_offset are given and
# of the same floating point type (float32/float64)
if (
add_offset is not None
and scale_factor is not None
and offset_type == scale_type
and scale_type in [np.float32, np.float64]
):
# in case of int32 -> we need to upcast to float64
# due to precision issues
if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer):
return np.float64
return np.dtype(scale_type).type
# Not CF conforming and add_offset given:
# A scale factor is entirely safe (vanishing into the mantissa),
# but a large integer offset could lead to loss of precision.
# Sensitivity analysis can be tricky, so we just use a float64
# if there's any offset at all - better unoptimised than wrong!
if add_offset is not None:
return np.float64
# return float32 in other cases where only scale_factor is given
return np.float32
# If no scale_factor or add_offset is given, use some general rules.
# Keep float32 as-is. Upcast half-precision to single-precision,
# because float16 is "intended for storage but not computation"
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
return np.float32
# float32 can exactly represent all integers up to 24 bits
if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
# A scale factor is entirely safe (vanishing into the mantissa),
# but a large integer offset could lead to loss of precision.
# Sensitivity analysis can be tricky, so we just use a float64
# if there's any offset at all - better unoptimised than wrong!
if not has_offset:
return np.float32
return np.float32
# For all other types and circumstances, we just use float64.
# (safe because eg. complex numbers are not supported in NetCDF)
return np.float64
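
A standalone sketch (not the PR code itself; the function name and structure are illustrative) of the dtype-selection rules spelled out in the comments above, useful for checking a few representative cases:

import numpy as np

def choose_float_dtype_sketch(dtype: np.dtype, attrs: dict) -> type:
    # mirrors the rules above: CF-conforming scale_factor/add_offset of the same
    # float type win, packed int32 is upcast to float64, a lone or non-conforming
    # add_offset forces float64, a lone scale_factor gives float32
    scale = attrs.get("scale_factor")
    offset = attrs.get("add_offset")
    if scale is not None or offset is not None:
        if (
            scale is not None
            and offset is not None
            and type(scale) is type(offset)
            and type(scale) in (np.float32, np.float64)
        ):
            if np.issubdtype(dtype, np.integer) and dtype.itemsize == 4:
                return np.float64
            return type(scale)
        if offset is not None:
            return np.float64
        return np.float32
    if np.issubdtype(dtype, np.floating) and dtype.itemsize <= 4:
        return np.float32
    if np.issubdtype(dtype, np.integer) and dtype.itemsize <= 2:
        return np.float32
    return np.float64

print(choose_float_dtype_sketch(np.dtype("int16"),
                                {"scale_factor": np.float32(0.01),
                                 "add_offset": np.float32(283.0)}))   # float32
print(choose_float_dtype_sketch(np.dtype("int32"),
                                {"scale_factor": np.float64(0.01),
                                 "add_offset": np.float64(283.0)}))   # float64
print(choose_float_dtype_sketch(np.dtype("int16"),
                                {"scale_factor": 0.01, "add_offset": 283}))  # float64
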
@@ -260,29 +366,40 @@ class CFScaleOffsetCoder(VariableCoder):

def encode(self, variable: Variable, name: T_Name = None) -> Variable:
dims, data, attrs, encoding = unpack_for_encoding(variable)

if "scale_factor" in encoding or "add_offset" in encoding:
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
scale_factor = pop_to(encoding, attrs, "scale_factor", name=name)
add_offset = pop_to(encoding, attrs, "add_offset", name=name)
if scale_factor or add_offset:
# if we have a _FillValue/missing_value we do not want to cast now
# but leave that to CFMaskCoder
dtype = data.dtype
if "_FillValue" not in encoding and "missing_value" not in encoding:
dtype = _choose_float_dtype(data.dtype, attrs)
# but we still need a copy to prevent changing the original data
data = data.astype(dtype=dtype, copy=True)
if "add_offset" in encoding:
data -= pop_to(encoding, attrs, "add_offset", name=name)
if "scale_factor" in encoding:
data /= pop_to(encoding, attrs, "scale_factor", name=name)

return Variable(dims, data, attrs, encoding, fastpath=True)
if add_offset:
data -= add_offset
if scale_factor:
data /= scale_factor
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable
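
A small round-trip sketch using the coder from this diff (illustrative values; assumes this branch, where _FillValue handling is deferred to CFMaskCoder); decode is shown below:

import numpy as np
import xarray as xr
from xarray.coding.variables import CFScaleOffsetCoder

# CF-conforming packing: scale_factor and add_offset share the same float type
var = xr.Variable(
    ("x",),
    np.array([283.0, 284.5, 286.0], dtype="float32"),
    encoding={"scale_factor": np.float32(0.01), "add_offset": np.float32(283.0)},
)
coder = CFScaleOffsetCoder()
encoded = coder.encode(var)      # (data - add_offset) / scale_factor, attrs updated
decoded = coder.decode(encoded)  # float32 again, values close to the originals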

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
_attrs = variable.attrs
if "scale_factor" in _attrs or "add_offset" in _attrs:
dims, data, attrs, encoding = unpack_for_decoding(variable)

scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
add_offset = pop_to(attrs, encoding, "add_offset", name=name)
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
dims, data, attrs, encoding = unpack_for_decoding(variable)
scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
add_offset = pop_to(attrs, encoding, "add_offset", name=name)
if scale_factor or add_offset:
if np.ndim(scale_factor) > 0:
scale_factor = np.asarray(scale_factor).item()
if np.ndim(add_offset) > 0:
add_offset = np.asarray(add_offset).item()
# if we have a _FillValue/missing_value we already have the wanted
# floating point dtype here (via CFMaskCoder), so no check is necessary
# only check in other cases
dtype = data.dtype
if "_FillValue" not in encoding and "missing_value" not in encoding:
dtype = _choose_float_dtype(dtype, encoding)

transform = partial(
_scale_offset_decoding,
scale_factor=scale_factor,
@@ -349,3 +466,100 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable


class DefaultFillvalueCoder(VariableCoder):
"""Encode default _FillValue if needed."""

def encode(self, variable: Variable, name: T_Name = None) -> Variable:
dims, data, attrs, encoding = unpack_for_encoding(variable)
# make NaN the fill value for float types
if (
"_FillValue" not in attrs
and "_FillValue" not in encoding
and np.issubdtype(variable.dtype, np.floating)
):
attrs["_FillValue"] = variable.dtype.type(np.nan)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
raise NotImplementedError()
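
A quick sketch of the default fill value behaviour (illustrative; assumes this branch):

import numpy as np
import xarray as xr
from xarray.coding.variables import DefaultFillvalueCoder

# a float variable without an explicit _FillValue gets NaN as default on encode
var = xr.Variable(("x",), np.array([1.0, 2.0], dtype="float32"))
encoded = DefaultFillvalueCoder().encode(var)
print(encoded.attrs["_FillValue"])  # nan (as float32)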


class BooleanCoder(VariableCoder):
"""Code boolean values."""

def encode(self, variable: Variable, name: T_Name = None) -> Variable:
if (
(variable.dtype == bool)
and ("dtype" not in variable.encoding)
and ("dtype" not in variable.attrs)
):
dims, data, attrs, encoding = unpack_for_encoding(variable)
attrs["dtype"] = "bool"
data = duck_array_ops.astype(data, dtype="i1", copy=True)

return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
if variable.attrs.get("dtype", False) == "bool":
dims, data, attrs, encoding = unpack_for_decoding(variable)
# overwrite (!) encoding accordingly and remove from attrs
encoding["dtype"] = attrs.pop("dtype")
data = BoolTypeArray(data)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable
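
A sketch of the boolean round trip (illustrative; assumes this branch):

import numpy as np
import xarray as xr
from xarray.coding.variables import BooleanCoder

var = xr.Variable(("x",), np.array([True, False, True]))
coder = BooleanCoder()
encoded = coder.encode(var)            # stored as int8 with attrs["dtype"] == "bool"
decoded = coder.decode(encoded)        # wrapped in BoolTypeArray, dtype bool again
print(encoded.dtype, decoded.dtype)    # int8 bool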


class EndianCoder(VariableCoder):
"""Decode Endianness to native."""

def encode(self):
raise NotImplementedError()

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
dims, data, attrs, encoding = unpack_for_decoding(variable)
if not data.dtype.isnative:
data = NativeEndiannessArray(data)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable
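
A sketch of endianness decoding (illustrative; assumes this branch and a little-endian host):

import numpy as np
import xarray as xr
from xarray.coding.variables import EndianCoder

# big-endian data as it would come from a netCDF3 file
var = xr.Variable(("x",), np.arange(3, dtype=">i2"))
decoded = EndianCoder().decode(var)
print(decoded.dtype)  # int16, native endianness (on little-endian hosts)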


class NonStringCoder(VariableCoder):
"""Encode NonString variables if dtypes differ."""

def encode(self, variable: Variable, name: T_Name = None) -> Variable:
if "dtype" in variable.encoding and variable.encoding["dtype"] not in (
"S1",
str,
):
dims, data, attrs, encoding = unpack_for_encoding(variable)
dtype = np.dtype(encoding.pop("dtype"))
if dtype != variable.dtype:
if np.issubdtype(dtype, np.integer):
if (
np.issubdtype(variable.dtype, np.floating)
and "_FillValue" not in variable.attrs
and "missing_value" not in variable.attrs
):
warnings.warn(
f"saving variable {name} with floating "
"point data as an integer dtype without "
"any _FillValue to use for NaNs",
SerializationWarning,
stacklevel=10,
)
data = np.around(data)
data = data.astype(dtype=dtype)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable

def decode(self):
raise NotImplementedError()
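
A sketch of the non-string dtype conversion on encode (illustrative values; assumes this branch):

import numpy as np
import xarray as xr
from xarray.coding.variables import NonStringCoder

# float data with an integer target dtype; the _FillValue attr avoids the
# SerializationWarning about NaNs
var = xr.Variable(
    ("x",),
    np.array([1.2, 2.7, 3.4]),
    attrs={"_FillValue": -9999},
    encoding={"dtype": "int16"},
)
encoded = NonStringCoder().encode(var)
print(encoded.dtype)  # int16 (values rounded before the cast)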