Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(fix): cache indptr for backed sparse matrices #1266

Merged
Merged 21 commits on Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions anndata/_core/sparse_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import collections.abc as cabc
import warnings
from abc import ABC
from functools import cached_property
from itertools import accumulate, chain
from typing import TYPE_CHECKING, Literal, NamedTuple

Expand Down Expand Up @@ -282,11 +283,21 @@ class BaseCompressedSparseDataset(ABC):

def __init__(self, group: h5py.Group | ZarrGroup):
    """Validate the on-disk group's encoding and keep a handle to it.

    Parameters
    ----------
    group
        h5py or zarr group containing the sparse matrix's ``data``,
        ``indices`` and ``indptr`` arrays.
    """
    type(self)._check_group_format(group)
    # Stored under a private name: the public `group` property is read-only
    # (its setter raises TypeError), so assigning `self.group` here would fail.
    self._group = group

shape: tuple[int, int]
"""Shape of the matrix."""

@property
def group(self):
    """The backing h5py/zarr group holding the sparse matrix arrays.

    Read-only: reassigning ``group`` raises ``TypeError``; use
    ``sparse_dataset`` to create a dataset over different storage.
    """
    # NOTE(review): a `-> GroupStorageType` return annotation broke the docs
    # build, so the return type is left to inference here.
    return self._group
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would be okay with this being made public, but could you add a docstring + type hint for this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes.

Copy link
Contributor Author

@ilan-gold ilan-gold Jan 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the type hint appears to be unnecessary. You'll notice that some of the builds failed because of documentation issues - I honestly don't know why. The other references to Group have no issue, but this one causes things to break. VSCode can infer the type and the docs are fine without it.

Copy link
Contributor Author

@ilan-gold ilan-gold Jan 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I lied. The docs don't work but the type hints do work e.g., in VSCode. I was looking at the wrong Group

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I have no idea why this is breaking. Extremely strange stuff. I will look again tomorrow.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like it's working. Can we resolve?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well I did not push the change, so the docs are building because of that, unless this works for you locally.


@group.setter
def group(self, value):
    # The backing group is fixed for the lifetime of this object; pointing at
    # different storage requires building a fresh dataset via `sparse_dataset`.
    msg = f"Do not reset group on a {type(self)}. Instead use `sparse_dataset` to make a new class."
    raise TypeError(msg)
ilan-gold marked this conversation as resolved.
Show resolved Hide resolved

ilan-gold marked this conversation as resolved.
Show resolved Hide resolved
@property
def backend(self) -> Literal["zarr", "hdf5"]:
if isinstance(self.group, ZarrGroup):
Expand Down Expand Up @@ -431,20 +442,24 @@ def append(self, sparse_matrix: ss.spmatrix):
indices.resize((orig_data_size + sparse_matrix.indices.shape[0],))
indices[orig_data_size:] = sparse_matrix.indices

@cached_property
def indptr(self):
    """The matrix's ``indptr`` array, read fully into memory.

    Cached on the instance so repeated indexing operations hit the
    disk-backed ``indptr`` dataset only once.
    """
    return self.group["indptr"][...]

ilan-gold marked this conversation as resolved.
Show resolved Hide resolved
def _to_backed(self) -> BackedSparseMatrix:
    """Build a ``BackedSparseMatrix`` whose data/indices stay on disk."""
    backed_cls = get_backed_class(self.format)
    backed = backed_cls(self.shape, dtype=self.dtype)
    # data and indices remain as on-disk datasets; indptr comes from the
    # in-memory cache.
    backed.data = self.group["data"]
    backed.indices = self.group["indices"]
    backed.indptr = self.indptr
    return backed

def to_memory(self) -> ss.spmatrix:
    """Read the full sparse matrix into an in-memory scipy matrix."""
    format_class = get_memory_class(self.format)
    mtx = format_class(self.shape, dtype=self.dtype)
    mtx.data = self.group["data"][...]
    mtx.indices = self.group["indices"][...]
    # Copy the cached indptr: handing out the cached_property array itself
    # would let callers mutate the cache through the returned matrix.
    mtx.indptr = self.indptr.copy()
    return mtx


Expand Down
20 changes: 20 additions & 0 deletions anndata/tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import numpy as np
import pandas as pd
import pytest
import zarr
from pandas.api.types import is_numeric_dtype
from scipy import sparse

Expand Down Expand Up @@ -743,3 +744,22 @@ def shares_memory_sparse(x, y):
marks=pytest.mark.gpu,
),
]


class AccessTrackingStore(zarr.DirectoryStore):
    """``DirectoryStore`` that counts reads of keys matching tracked substrings."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # substring -> number of __getitem__ calls whose key contained it
        self._access_count = {}

    def __getitem__(self, key):
        # Attribute this read to every tracked substring appearing in the key.
        for substring in self._access_count:
            if substring in key:
                self._access_count[substring] += 1
        return super().__getitem__(key)

    def get_access_count(self, key):
        """Return the read count recorded for a tracked substring."""
        return self._access_count[key]

    def set_key_trackers(self, keys_to_track):
        """Start counting reads for each substring in *keys_to_track*."""
        self._access_count.update(dict.fromkeys(keys_to_track, 0))
45 changes: 44 additions & 1 deletion anndata/tests/test_backed_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from anndata._core.anndata import AnnData
from anndata._core.sparse_dataset import sparse_dataset
from anndata.experimental import read_dispatched
from anndata.tests.helpers import assert_equal, subset_func
from anndata.tests.helpers import AccessTrackingStore, assert_equal, subset_func

if TYPE_CHECKING:
from pathlib import Path
Expand Down Expand Up @@ -164,6 +164,34 @@ def test_dataset_append_disk(
assert_equal(fromdisk, frommem)


@pytest.mark.parametrize(
    ["sparse_format"],
    [
        pytest.param(sparse.csr_matrix),
        pytest.param(sparse.csc_matrix),
    ],
)
def test_indptr_cache(
    tmp_path: Path,
    sparse_format: Callable[[ArrayLike], sparse.spmatrix],
):
    """Repeated indexing of a backed sparse matrix reads indptr only once."""
    path = tmp_path / "test.zarr"
    mtx = sparse_format(sparse.random(10, 10))
    group = zarr.open_group(path, "a")
    ad._io.specs.write_elem(group, "X", mtx)
    store = AccessTrackingStore(path)
    store.set_key_trackers(["X/indptr"])
    tracked_group = zarr.open_group(store, "a")
    backed = sparse_dataset(tracked_group["X"])
    # Four separate reads; the cached indptr must not be re-fetched each time.
    for selection in (slice(None, 1), slice(3, 5), slice(6, 7), slice(8, 9)):
        backed[selection]
    # one access for .zarray metadata + one for the actual indptr data
    assert store.get_access_count("X/indptr") == 2


@pytest.mark.parametrize(
["sparse_format", "a_shape", "b_shape"],
[
Expand Down Expand Up @@ -198,6 +226,21 @@ def test_wrong_shape(
a_disk.append(b_disk)


def test_reset_group(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]):
    """``group`` is read-only: reassigning it must raise ``TypeError``.

    Fix: the body branches on ``diskfmt`` (and uses ``h5py``), but the fixture
    was missing from the signature, so the test died with a ``NameError``.
    """
    path = tmp_path / "test.zarr"  # extension is cosmetic; h5py accepts it too
    base = sparse.random(100, 100, format="csr")

    if diskfmt == "zarr":
        f = zarr.open_group(path, "a")
    else:
        f = h5py.File(path, "a")

    ad._io.specs.write_elem(f, "base", base)
    disk_mtx = sparse_dataset(f["base"])
    with pytest.raises(TypeError):
        disk_mtx.group = f


def test_wrong_formats(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]):
path = (
tmp_path / f"test.{diskfmt.replace('ad', '')}"
Expand Down
1 change: 1 addition & 0 deletions docs/release-notes/0.10.4.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* `AnnData.__sizeof__()` support for backed datasets {pr}`1230` {user}`Neah-Ko`
* `adata[:, []]` now returns an `AnnData` object empty on the appropriate dimensions instead of erroring {pr}`1243` {user}`ilan-gold`
* `adata.X[mask]` works in newer `numpy` versions when `X` is `backed` {pr}`1255` {user}`ilan-gold`
* `BaseCompressedSparseDataset`'s `indptr` is cached {pr}`1266` {user}`ilan-gold`
ivirshup marked this conversation as resolved.
Show resolved Hide resolved

```{rubric} Documentation
```
Expand Down
Loading