From 45a65f08d0cb5e80a443c6054b59287f94ec48a6 Mon Sep 17 00:00:00 2001 From: Philipp A Date: Thu, 4 Jan 2024 14:04:21 +0100 Subject: [PATCH] Fix IO error reporting (#1273) --- .pre-commit-config.yaml | 2 + anndata/_core/anndata.py | 2 +- anndata/_io/h5ad.py | 19 ++++--- anndata/_io/specs/registry.py | 81 +++++++++++++--------------- anndata/_io/utils.py | 63 ++++++++++++---------- anndata/_io/zarr.py | 2 +- anndata/compat/__init__.py | 17 +++++- anndata/experimental/_dispatch_io.py | 4 +- anndata/tests/test_awkward.py | 13 ++--- anndata/tests/test_io_utils.py | 37 +++++++++---- anndata/tests/test_readwrite.py | 2 +- docs/release-notes/0.10.4.md | 1 + pyproject.toml | 2 +- 13 files changed, 140 insertions(+), 105 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f9626c3d4..2ce52a6d7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,6 +11,8 @@ repos: rev: v4.0.0-alpha.8 hooks: - id: prettier + exclude_types: + - markdown - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index b37e33177..a2b010bdf 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -74,7 +74,7 @@ class StorageType(Enum): DaskArray = DaskArray CupyArray = CupyArray CupySparseMatrix = CupySparseMatrix - BackedSparseMAtrix = BaseCompressedSparseDataset + BackedSparseMatrix = BaseCompressedSparseDataset @classmethod def classes(cls): diff --git a/anndata/_io/h5ad.py b/anndata/_io/h5ad.py index 337f13f92..5f31da04a 100644 --- a/anndata/_io/h5ad.py +++ b/anndata/_io/h5ad.py @@ -6,6 +6,7 @@ from types import MappingProxyType from typing import ( TYPE_CHECKING, + Any, Callable, Literal, TypeVar, @@ -112,7 +113,13 @@ def write_h5ad( @report_write_key_on_error @write_spec(IOSpec("array", "0.2.0")) -def write_sparse_as_dense(f, key, value, dataset_kwargs=MappingProxyType({})): +def write_sparse_as_dense( + f: h5py.Group, + key: str, + value: sparse.spmatrix | BaseCompressedSparseDataset, + *, + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), +): real_key = None # Flag for if temporary key was used if key in f: if isinstance(value, BaseCompressedSparseDataset) and ( @@ -269,7 +276,7 @@ def callback(func, elem_name: str, elem, iospec): def _read_raw( f: h5py.File | AnnDataFileManager, as_sparse: Collection[str] = (), - rdasp: Callable[[h5py.Dataset], sparse.spmatrix] = None, + rdasp: Callable[[h5py.Dataset], sparse.spmatrix] | None = None, *, attrs: Collection[str] = ("X", "var", "varm"), ) -> dict: @@ -286,7 +293,7 @@ def _read_raw( @report_read_key_on_error -def read_dataframe_legacy(dataset) -> pd.DataFrame: +def read_dataframe_legacy(dataset: h5py.Dataset) -> pd.DataFrame: """Read pre-anndata 0.7 dataframes.""" warn( f"'{dataset.name}' was written with a very old version of AnnData. " @@ -305,7 +312,7 @@ def read_dataframe_legacy(dataset) -> pd.DataFrame: return df -def read_dataframe(group) -> pd.DataFrame: +def read_dataframe(group: h5py.Group | h5py.Dataset) -> pd.DataFrame: """Backwards compat function""" if not isinstance(group, h5py.Group): return read_dataframe_legacy(group) @@ -352,7 +359,7 @@ def read_dense_as_sparse( raise ValueError(f"Cannot read dense array as type: {sparse_format}") -def read_dense_as_csr(dataset, axis_chunk=6000): +def read_dense_as_csr(dataset: h5py.Dataset, axis_chunk: int = 6000): sub_matrices = [] for idx in idx_chunks_along_axis(dataset.shape, 0, axis_chunk): dense_chunk = dataset[idx] @@ -361,7 +368,7 @@ def read_dense_as_csr(dataset, axis_chunk=6000): return sparse.vstack(sub_matrices, format="csr") -def read_dense_as_csc(dataset, axis_chunk=6000): +def read_dense_as_csc(dataset: h5py.Dataset, axis_chunk: int = 6000): sub_matrices = [] for idx in idx_chunks_along_axis(dataset.shape, 1, axis_chunk): sub_matrix = sparse.csc_matrix(dataset[idx]) diff --git a/anndata/_io/specs/registry.py b/anndata/_io/specs/registry.py index 1f0b137f4..a8357295d 100644 --- a/anndata/_io/specs/registry.py +++ b/anndata/_io/specs/registry.py @@ -1,6 +1,6 @@ from __future__ import annotations -from collections.abc import Callable, Iterable, Mapping +from collections.abc import Mapping from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType @@ -10,12 +10,13 @@ from anndata.compat import _read_attr if TYPE_CHECKING: + from collections.abc import Callable, Generator, Iterable + from anndata._types import GroupStorageType, StorageType + # TODO: This probably should be replaced by a hashable Mapping due to conversion b/w "_" and "-" # TODO: Should filetype be included in the IOSpec if it changes the encoding? Or does the intent that these things be "the same" overrule that? - - @dataclass(frozen=True) class IOSpec: encoding_type: str @@ -25,7 +26,9 @@ class IOSpec: # TODO: Should this subclass from LookupError? class IORegistryError(Exception): @classmethod - def _from_write_parts(cls, dest_type, typ, modifiers) -> IORegistryError: + def _from_write_parts( + cls, dest_type: type, typ: type, modifiers: frozenset[str] + ) -> IORegistryError: msg = f"No method registered for writing {typ} into {dest_type}" if modifiers: msg += f" with {modifiers}" @@ -36,7 +39,7 @@ def _from_read_parts( cls, method: str, registry: Mapping, - src_typ: StorageType, + src_typ: type[StorageType], spec: IOSpec, ) -> IORegistryError: # TODO: Improve error message if type exists, but version does not @@ -50,7 +53,7 @@ def _from_read_parts( def write_spec(spec: IOSpec): def decorator(func: Callable): @wraps(func) - def wrapper(g, k, *args, **kwargs): + def wrapper(g: GroupStorageType, k: str, *args, **kwargs): result = func(g, k, *args, **kwargs) g[k].attrs.setdefault("encoding-type", spec.encoding_type) g[k].attrs.setdefault("encoding-version", spec.encoding_version) @@ -193,12 +196,12 @@ def proc_spec(spec) -> IOSpec: @proc_spec.register(IOSpec) -def proc_spec_spec(spec) -> IOSpec: +def proc_spec_spec(spec: IOSpec) -> IOSpec: return spec @proc_spec.register(Mapping) -def proc_spec_mapping(spec) -> IOSpec: +def proc_spec_mapping(spec: Mapping[str, str]) -> IOSpec: return IOSpec(**{k.replace("-", "_"): v for k, v in spec.items()}) @@ -213,7 +216,9 @@ def get_spec( ) -def _iter_patterns(elem): +def _iter_patterns( + elem, +) -> Generator[tuple[type, type | str] | tuple[type, type, str], None, None]: """Iterates over possible patterns for an element in order of precedence.""" from anndata.compat import DaskArray @@ -236,40 +241,27 @@ def __init__(self, registry: IORegistry, callback: Callable | None = None) -> No def read_elem( self, elem: StorageType, - modifiers: frozenset(str) = frozenset(), + modifiers: frozenset[str] = frozenset(), ) -> Any: """Read an element from a store. See exported function for more details.""" from functools import partial - read_func = self.registry.get_reader( - type(elem), get_spec(elem), frozenset(modifiers) + iospec = get_spec(elem) + read_func = partial( + self.registry.get_reader(type(elem), iospec, modifiers), + _reader=self, ) - read_func = partial(read_func, _reader=self) - if self.callback is not None: - return self.callback(read_func, elem.name, elem, iospec=get_spec(elem)) - else: + if self.callback is None: return read_func(elem) + return self.callback(read_func, elem.name, elem, iospec=iospec) class Writer: - def __init__( - self, - registry: IORegistry, - callback: Callable[ - [ - GroupStorageType, - str, - StorageType, - dict, - ], - None, - ] - | None = None, - ): + def __init__(self, registry: IORegistry, callback: Callable | None = None): self.registry = registry self.callback = callback - def find_writer(self, dest_type, elem, modifiers): + def find_writer(self, dest_type: type, elem, modifiers: frozenset[str]): for pattern in _iter_patterns(elem): if self.registry.has_writer(dest_type, pattern, modifiers): return self.registry.get_writer(dest_type, pattern, modifiers) @@ -281,10 +273,10 @@ def write_elem( self, store: GroupStorageType, k: str, - elem, + elem: Any, *, - dataset_kwargs=MappingProxyType({}), - modifiers=frozenset(), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), + modifiers: frozenset[str] = frozenset(), ): from functools import partial from pathlib import PurePosixPath @@ -313,17 +305,16 @@ def write_elem( _writer=self, ) - if self.callback is not None: - return self.callback( - write_func, - store, - k, - elem, - dataset_kwargs=dataset_kwargs, - iospec=self.registry.get_spec(elem), - ) - else: + if self.callback is None: return write_func(store, k, elem, dataset_kwargs=dataset_kwargs) + return self.callback( + write_func, + store, + k, + elem, + dataset_kwargs=dataset_kwargs, + iospec=self.registry.get_spec(elem), + ) def read_elem(elem: StorageType) -> Any: @@ -346,7 +337,7 @@ def write_elem( k: str, elem: Any, *, - dataset_kwargs: Mapping = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: """ Write an element to a storage group using anndata encoding. diff --git a/anndata/_io/utils.py b/anndata/_io/utils.py index cd90be473..6f46ad7eb 100644 --- a/anndata/_io/utils.py +++ b/anndata/_io/utils.py @@ -1,15 +1,19 @@ from __future__ import annotations from functools import wraps -from typing import Callable, Literal +from typing import TYPE_CHECKING, Callable, Literal, Union, cast from warnings import warn import h5py from packaging.version import Version -from anndata.compat import H5Group, ZarrGroup, add_note - from .._core.sparse_dataset import BaseCompressedSparseDataset +from ..compat import H5Group, ZarrGroup, add_note, pairwise + +if TYPE_CHECKING: + from .._types import StorageType + + Storage = Union[StorageType, BaseCompressedSparseDataset] # For allowing h5py v3 # https://github.com/scverse/anndata/issues/442 @@ -151,31 +155,26 @@ class AnnDataReadError(OSError): pass -def _get_parent(elem): - try: - import zarr - except ImportError: - zarr = None - if zarr and isinstance(elem, (zarr.Group, zarr.Array)): - parent = elem.store # Not sure how to always get a name out of this - elif isinstance(elem, BaseCompressedSparseDataset): - parent = elem.group.file.name - else: - parent = elem.file.name - return parent +def _get_display_path(store: Storage) -> str: + """Return an absolute path of an element (always starts with “/”).""" + if isinstance(store, BaseCompressedSparseDataset): + store = store.group + path = store.name or "??" # can be None + return f'/{path.removeprefix("/")}' -def add_key_note(e: BaseException, elem, key, op=Literal["read", "writ"]) -> None: +def add_key_note( + e: BaseException, store: Storage, path: str, key: str, op: Literal["read", "writ"] +) -> None: if any( f"Error raised while {op}ing key" in note for note in getattr(e, "__notes__", []) ): return - parent = _get_parent(elem) - add_note( - e, - f"Error raised while {op}ing key {key!r} of {type(elem)} to " f"{parent}", - ) + + dir = "to" if op == "writ" else "from" + msg = f"Error raised while {op}ing key {key!r} of {type(store)} {dir} {path}" + add_note(e, msg) def report_read_key_on_error(func): @@ -198,13 +197,17 @@ def func_wrapper(*args, **kwargs): from anndata._io.specs import Reader # Figure out signature (method vs function) by going through args - for elem in args: - if not isinstance(elem, Reader): + for arg in args: + if not isinstance(arg, Reader): + store = cast("Storage", arg) break + else: + raise ValueError("No element found in args.") try: return func(*args, **kwargs) except Exception as e: - add_key_note(e, elem, elem.name, "read") + path, key = _get_display_path(store).rsplit("/", 1) + add_key_note(e, store, path or "/", key, "read") raise return func_wrapper @@ -230,15 +233,17 @@ def func_wrapper(*args, **kwargs): from anndata._io.specs import Writer # Figure out signature (method vs function) by going through args - for i in range(len(args)): - elem = args[i] - key = args[i + 1] - if not isinstance(elem, Writer): + for arg, key in pairwise(args): + if not isinstance(arg, Writer): + store = cast("Storage", arg) break + else: + raise ValueError("No element found in args.") try: return func(*args, **kwargs) except Exception as e: - add_key_note(e, elem, key, "writ") + path = _get_display_path(store) + add_key_note(e, store, path, key, "writ") raise return func_wrapper diff --git a/anndata/_io/zarr.py b/anndata/_io/zarr.py index 00f9766f0..864475848 100644 --- a/anndata/_io/zarr.py +++ b/anndata/_io/zarr.py @@ -133,7 +133,7 @@ def read_dataframe_legacy(dataset: zarr.Array) -> pd.DataFrame: @report_read_key_on_error -def read_dataframe(group) -> pd.DataFrame: +def read_dataframe(group: zarr.Group | zarr.Array) -> pd.DataFrame: # Fast paths if isinstance(group, zarr.Array): return read_dataframe_legacy(group) diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py index 24f3875b3..39323d73a 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations import os +import sys from codecs import decode from collections.abc import Mapping from contextlib import AbstractContextManager @@ -35,9 +36,9 @@ class Empty: ############################# -try: +if sys.version_info >= (3, 11): from contextlib import chdir -except ImportError: # Python < 3.11 +else: @dataclass class chdir(AbstractContextManager): @@ -52,6 +53,18 @@ def __exit__(self, *_exc_info) -> None: os.chdir(self._old_cwd.pop()) +if sys.version_info >= (3, 10): + from itertools import pairwise +else: + + def pairwise(iterable): + from itertools import tee + + a, b = tee(iterable) + next(b, None) + return zip(a, b) + + ############################# # Optional deps ############################# diff --git a/anndata/experimental/_dispatch_io.py b/anndata/experimental/_dispatch_io.py index 4df4d417a..2a399d540 100644 --- a/anndata/experimental/_dispatch_io.py +++ b/anndata/experimental/_dispatch_io.py @@ -4,6 +4,8 @@ from typing import TYPE_CHECKING, Any, Callable if TYPE_CHECKING: + from collections.abc import Mapping + from anndata._io.specs import IOSpec from anndata._types import GroupStorageType, StorageType @@ -55,7 +57,7 @@ def write_dispatched( None, ], *, - dataset_kwargs=MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: """ Write elem to store, recursively calling callback at each sub-element. diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index 9e780c8a8..2e996bfc6 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -1,7 +1,7 @@ """Tests related to awkward arrays""" from __future__ import annotations -from contextlib import nullcontext +import warnings import numpy as np import numpy.testing as npt @@ -381,15 +381,12 @@ def test_concat_mixed_types(key, arrays, expected, join): to_concat.append(tmp_adata) if isinstance(expected, type) and issubclass(expected, Exception): - ctx = ( - pytest.warns( + with pytest.raises(expected), warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + r"The behavior of DataFrame concatenation with empty or all-NA entries is deprecated", FutureWarning, - match=r"The behavior of DataFrame concatenation with empty or all-NA entries is deprecated", ) - if any(df.empty for df in arrays if isinstance(df, pd.DataFrame)) - else nullcontext() - ) - with pytest.raises(expected), ctx: anndata.concat(to_concat, axis=axis, join=join) else: result_adata = anndata.concat(to_concat, axis=axis, join=join) diff --git a/anndata/tests/test_io_utils.py b/anndata/tests/test_io_utils.py index c70091474..803a4ad72 100644 --- a/anndata/tests/test_io_utils.py +++ b/anndata/tests/test_io_utils.py @@ -1,6 +1,7 @@ from __future__ import annotations -from contextlib import suppress +from contextlib import AbstractContextManager, suppress +from typing import TYPE_CHECKING import h5py import pandas as pd @@ -9,13 +10,15 @@ import anndata as ad from anndata._io.specs.registry import IORegistryError -from anndata._io.utils import ( - report_read_key_on_error, -) +from anndata._io.utils import report_read_key_on_error from anndata.compat import _clean_uns from anndata.experimental import read_elem, write_elem from anndata.tests.helpers import pytest_8_raises +if TYPE_CHECKING: + from collections.abc import Callable + from pathlib import Path + @pytest.fixture(params=["h5ad", "zarr"]) def diskfmt(request): @@ -29,20 +32,32 @@ def diskfmt(request): pytest.param(lambda p: h5py.File(p / "test.h5", mode="a"), id="h5py"), ], ) -def test_key_error(tmp_path, group_fn): +@pytest.mark.parametrize("nested", [True, False], ids=["nested", "root"]) +def test_key_error( + tmp_path, group_fn: Callable[[Path], zarr.Group | h5py.Group], nested: bool +): @report_read_key_on_error def read_attr(_): raise NotImplementedError() group = group_fn(tmp_path) - with group if hasattr(group, "__enter__") else suppress(): + with group if isinstance(group, AbstractContextManager) else suppress(): + if nested: + group = group.create_group("nested") + path = "/nested" + else: + path = "/" group["X"] = [1, 2, 3] group.create_group("group") - with pytest_8_raises(NotImplementedError, match=r"/X"): + with pytest_8_raises( + NotImplementedError, match=rf"reading key 'X'.*from {path}$" + ): read_attr(group["X"]) - with pytest_8_raises(NotImplementedError, match=r"/group"): + with pytest_8_raises( + NotImplementedError, match=rf"reading key 'group'.*from {path}$" + ): read_attr(group["group"]) @@ -53,7 +68,9 @@ def test_write_error_info(diskfmt, tmp_path): # Assuming we don't define a writer for tuples a = ad.AnnData(uns={"a": {"b": {"c": (1, 2, 3)}}}) - with pytest_8_raises(IORegistryError, match=r"Error raised while writing key 'c'"): + with pytest_8_raises( + IORegistryError, match=r"Error raised while writing key 'c'.*to /uns/a/b" + ): write(a) @@ -89,7 +106,7 @@ class Foo: # (?!...) is a negative lookahead # (?s) enables the dot to match newlines # https://stackoverflow.com/a/406408/130164 <- copilot suggested lol - pattern = r"(?s)((?!Error raised while writing key '/?a').)*$" + pattern = r"(?s)^((?!Error raised while writing key '/?a').)*$" with pytest_8_raises(IORegistryError, match=pattern): write_elem(group, "/", {"a": {"b": Foo()}}) diff --git a/anndata/tests/test_readwrite.py b/anndata/tests/test_readwrite.py index 22b1aaffc..67341fe57 100644 --- a/anndata/tests/test_readwrite.py +++ b/anndata/tests/test_readwrite.py @@ -277,7 +277,7 @@ def test_read_full_io_error(tmp_path, name, read, write): store["obs"].attrs["encoding-type"] = "invalid" with pytest_8_raises( IORegistryError, - match=r"raised while reading key '/obs'", + match=r"raised while reading key 'obs'.*from /$", ) as exc_info: read(path) assert re.search( diff --git a/docs/release-notes/0.10.4.md b/docs/release-notes/0.10.4.md index 2739f5c7f..ee1560ebe 100644 --- a/docs/release-notes/0.10.4.md +++ b/docs/release-notes/0.10.4.md @@ -7,6 +7,7 @@ * `adata[:, []]` now returns an `AnnData` object empty on the appropriate dimensions instead of erroring {pr}`1243` {user}`ilan-gold` * `adata.X[mask]` works in newer `numpy` versions when `X` is `backed` {pr}`1255` {user}`ilan-gold` * `adata.X[...]` fixed for `X` as a `BaseCompressedSparseDataset` with `zarr` backend {pr}`1265` {user}`ilan-gold` +* Improve read/write error reporting {pr}`1273` {user}`flying-sheep` ```{rubric} Documentation ``` diff --git a/pyproject.toml b/pyproject.toml index c705aff43..ffb275bde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,7 @@ doc = [ ] test = [ "loompy>=3.0.5", - "pytest>=6.0", + "pytest >=6.0, !=8.0.0rc1", # https://github.com/pytest-dev/pytest/issues/11759 "pytest-cov>=2.10", "zarr", "matplotlib",