Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(fix): cache indptr for backed sparse matrices #1266

Merged
Merged 21 commits on Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions anndata/_core/sparse_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import collections.abc as cabc
import warnings
from abc import ABC
from functools import cached_property
from itertools import accumulate, chain
from typing import TYPE_CHECKING, Literal, NamedTuple

Expand Down Expand Up @@ -282,11 +283,21 @@ class BaseCompressedSparseDataset(ABC):

def __init__(self, group: h5py.Group | ZarrGroup):
    """Validate the on-disk group's encoding and keep a handle to it.

    Parameters
    ----------
    group
        h5py or zarr group containing the sparse matrix's ``data``,
        ``indices`` and ``indptr`` arrays.
    """
    type(self)._check_group_format(group)
    # Stored under a private name: the public `group` property is read-only
    # (its setter raises TypeError), so assigning `self.group` here would fail.
    self._group = group

shape: tuple[int, int]
"""Shape of the matrix."""

@property
def group(self):
    """The backing h5py/zarr group holding the sparse matrix arrays.

    Read-only: reassigning ``group`` raises ``TypeError``; use
    ``sparse_dataset`` to create a dataset over different storage.
    """
    # NOTE(review): a `-> GroupStorageType` return annotation broke the docs
    # build, so the return type is left to inference here.
    return self._group
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would be okay with this being made public, but could you add a docstring + type hint for this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes.

Copy link
Contributor Author

@ilan-gold ilan-gold Jan 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the type hint appears to be unnecessary. You'll notice that some of the builds failed because of documentation issues - I honestly don't know why. The other references to Group have no issue, but this one causes things to break. VSCode can infer the type and the docs are fine without it.

Copy link
Contributor Author

@ilan-gold ilan-gold Jan 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I lied. The docs don't work but the type hints do work e.g., in VSCode. I was looking at the wrong Group

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I have no idea why this is breaking. Extremely strange stuff. I will look again tomorrow.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like it's working. Can we resolve?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well I did not push the change, so the docs are building because of that, unless this works for you locally.


@group.setter
def group(self, value):
    # The backing group is fixed for the lifetime of this object; pointing at
    # different storage requires building a fresh dataset via `sparse_dataset`.
    msg = f"Do not reset group on a {type(self)}. Instead use `sparse_dataset` to make a new class."
    raise TypeError(msg)
ilan-gold marked this conversation as resolved.
Show resolved Hide resolved

ilan-gold marked this conversation as resolved.
Show resolved Hide resolved
@property
def backend(self) -> Literal["zarr", "hdf5"]:
if isinstance(self.group, ZarrGroup):
Expand Down Expand Up @@ -431,20 +442,24 @@ def append(self, sparse_matrix: ss.spmatrix):
indices.resize((orig_data_size + sparse_matrix.indices.shape[0],))
indices[orig_data_size:] = sparse_matrix.indices

@cached_property
def indptr(self):
    """The matrix's ``indptr`` array, read fully into memory.

    Cached on the instance so repeated indexing operations hit the
    disk-backed ``indptr`` dataset only once.
    """
    return self.group["indptr"][...]

ilan-gold marked this conversation as resolved.
Show resolved Hide resolved
def _to_backed(self) -> BackedSparseMatrix:
    """Build a ``BackedSparseMatrix`` whose data/indices stay on disk."""
    backed_cls = get_backed_class(self.format)
    backed = backed_cls(self.shape, dtype=self.dtype)
    # data and indices remain as on-disk datasets; indptr comes from the
    # in-memory cache.
    backed.data = self.group["data"]
    backed.indices = self.group["indices"]
    backed.indptr = self.indptr
    return backed

def to_memory(self) -> ss.spmatrix:
    """Read the full sparse matrix into an in-memory scipy matrix."""
    format_class = get_memory_class(self.format)
    mtx = format_class(self.shape, dtype=self.dtype)
    mtx.data = self.group["data"][...]
    mtx.indices = self.group["indices"][...]
    # Copy the cached indptr: handing out the cached_property array itself
    # would let callers mutate the cache through the returned matrix.
    mtx.indptr = self.indptr.copy()
    return mtx


Expand Down
20 changes: 20 additions & 0 deletions anndata/tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import numpy as np
import pandas as pd
import pytest
import zarr
from pandas.api.types import is_numeric_dtype
from scipy import sparse

Expand Down Expand Up @@ -743,3 +744,22 @@ def shares_memory_sparse(x, y):
marks=pytest.mark.gpu,
),
]


class AccessTrackingStore(zarr.DirectoryStore):
    """``DirectoryStore`` that counts reads of keys matching tracked substrings."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # substring -> number of __getitem__ calls whose key contained it
        self._access_count = {}

    def __getitem__(self, key):
        # Attribute this read to every tracked substring appearing in the key.
        for substring in self._access_count:
            if substring in key:
                self._access_count[substring] += 1
        return super().__getitem__(key)

    def get_access_count(self, key):
        """Return the read count recorded for a tracked substring."""
        return self._access_count[key]

    def set_key_trackers(self, keys_to_track):
        """Start counting reads for each substring in *keys_to_track*."""
        self._access_count.update(dict.fromkeys(keys_to_track, 0))
45 changes: 44 additions & 1 deletion anndata/tests/test_backed_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from anndata._core.anndata import AnnData
from anndata._core.sparse_dataset import sparse_dataset
from anndata.experimental import read_dispatched
from anndata.tests.helpers import assert_equal, subset_func
from anndata.tests.helpers import AccessTrackingStore, assert_equal, subset_func

if TYPE_CHECKING:
from pathlib import Path
Expand Down Expand Up @@ -164,6 +164,34 @@ def test_dataset_append_disk(
assert_equal(fromdisk, frommem)


@pytest.mark.parametrize(
    ["sparse_format"],
    [
        pytest.param(sparse.csr_matrix),
        pytest.param(sparse.csc_matrix),
    ],
)
def test_indptr_cache(
    tmp_path: Path,
    sparse_format: Callable[[ArrayLike], sparse.spmatrix],
):
    """Repeated indexing of a backed sparse matrix reads indptr only once."""
    path = tmp_path / "test.zarr"
    mtx = sparse_format(sparse.random(10, 10))
    group = zarr.open_group(path, "a")
    ad._io.specs.write_elem(group, "X", mtx)
    store = AccessTrackingStore(path)
    store.set_key_trackers(["X/indptr"])
    tracked_group = zarr.open_group(store, "a")
    backed = sparse_dataset(tracked_group["X"])
    # Four separate reads; the cached indptr must not be re-fetched each time.
    for selection in (slice(None, 1), slice(3, 5), slice(6, 7), slice(8, 9)):
        backed[selection]
    # one access for .zarray metadata + one for the actual indptr data
    assert store.get_access_count("X/indptr") == 2


@pytest.mark.parametrize(
["sparse_format", "a_shape", "b_shape"],
[
Expand Down Expand Up @@ -198,6 +226,21 @@ def test_wrong_shape(
a_disk.append(b_disk)


def test_reset_group(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]):
    """``group`` is read-only: reassigning it must raise ``TypeError``.

    Fix: the body branches on ``diskfmt`` (and uses ``h5py``), but the fixture
    was missing from the signature, so the test died with a ``NameError``.
    """
    path = tmp_path / "test.zarr"  # extension is cosmetic; h5py accepts it too
    base = sparse.random(100, 100, format="csr")

    if diskfmt == "zarr":
        f = zarr.open_group(path, "a")
    else:
        f = h5py.File(path, "a")

    ad._io.specs.write_elem(f, "base", base)
    disk_mtx = sparse_dataset(f["base"])
    with pytest.raises(TypeError):
        disk_mtx.group = f


def test_wrong_formats(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]):
path = (
tmp_path / f"test.{diskfmt.replace('ad', '')}"
Expand Down
1 change: 1 addition & 0 deletions docs/release-notes/0.10.4.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* `AnnData.__sizeof__()` support for backed datasets {pr}`1230` {user}`Neah-Ko`
* `adata[:, []]` now returns an `AnnData` object empty on the appropriate dimensions instead of erroring {pr}`1243` {user}`ilan-gold`
* `adata.X[mask]` works in newer `numpy` versions when `X` is `backed` {pr}`1255` {user}`ilan-gold`
* `BaseCompressedSparseDataset`'s `indptr` is cached {pr}`1266` {user}`ilan-gold`
ivirshup marked this conversation as resolved.
Show resolved Hide resolved

```{rubric} Documentation
```
Expand Down
Loading