From 2ab87c47c064e2dec5c43f263360efc365c3c891 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Tue, 10 Dec 2024 15:53:23 +0100 Subject: [PATCH] Backport PR #1790: (performance): speedup for `zarr`-based sparse indexing --- docs/release-notes/1790.performance.md | 1 + src/anndata/_core/sparse_dataset.py | 42 +++++++++++++++++--------- 2 files changed, 28 insertions(+), 15 deletions(-) create mode 100644 docs/release-notes/1790.performance.md diff --git a/docs/release-notes/1790.performance.md b/docs/release-notes/1790.performance.md new file mode 100644 index 000000000..deb160f07 --- /dev/null +++ b/docs/release-notes/1790.performance.md @@ -0,0 +1 @@ +Batch slice-based indexing in {class}`anndata.abc.CSRDataset` and {class}`anndata.abc.CSCDataset` for performance boost in `zarr` {user}`ilan-gold` diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py index a155352af..6099a1765 100644 --- a/src/anndata/_core/sparse_dataset.py +++ b/src/anndata/_core/sparse_dataset.py @@ -16,7 +16,7 @@ from abc import ABC from collections.abc import Iterable from functools import cached_property -from itertools import accumulate, chain +from itertools import accumulate, chain, pairwise from math import floor from pathlib import Path from typing import TYPE_CHECKING, NamedTuple @@ -250,30 +250,42 @@ def slice_as_int(s: slice, l: int) -> int: def get_compressed_vectors( x: BackedSparseMatrix, row_idxs: Iterable[int] ) -> tuple[Sequence, Sequence, Sequence]: - slices = [slice(*(x.indptr[i : i + 2])) for i in row_idxs] - data = np.concatenate([x.data[s] for s in slices]) - indices = np.concatenate([x.indices[s] for s in slices]) - indptr = list(accumulate(chain((0,), (s.stop - s.start for s in slices)))) + indptr_slices = [slice(*(x.indptr[i : i + 2])) for i in row_idxs] + # HDF5 cannot handle out-of-order integer indexing + if isinstance(x.data, ZarrArray): + as_np_indptr = np.concatenate( + [np.arange(s.start, s.stop) for s in indptr_slices] + ) + data = x.data[as_np_indptr] + indices = x.indices[as_np_indptr] + else: + data = np.concatenate([x.data[s] for s in indptr_slices]) + indices = np.concatenate([x.indices[s] for s in indptr_slices]) + indptr = list(accumulate(chain((0,), (s.stop - s.start for s in indptr_slices)))) return data, indices, indptr def get_compressed_vectors_for_slices( x: BackedSparseMatrix, slices: Iterable[slice] ) -> tuple[Sequence, Sequence, Sequence]: - indptr_sels = [x.indptr[slice(s.start, s.stop + 1)] for s in slices] - data = np.concatenate([x.data[s[0] : s[-1]] for s in indptr_sels]) - indices = np.concatenate([x.indices[s[0] : s[-1]] for s in indptr_sels]) + indptr_indices = [x.indptr[slice(s.start, s.stop + 1)] for s in slices] + indptr_limits = [slice(i[0], i[-1]) for i in indptr_indices] + # HDF5 cannot handle out-of-order integer indexing + if isinstance(x.data, ZarrArray): + indptr_int = np.concatenate([np.arange(s.start, s.stop) for s in indptr_limits]) + data = x.data[indptr_int] + indices = x.indices[indptr_int] + else: + data = np.concatenate([x.data[s] for s in indptr_limits]) + indices = np.concatenate([x.indices[s] for s in indptr_limits]) # Need to track the size of the gaps in the slices to each indptr subselection - total = indptr_sels[0][0] - offsets = [total] - for i, sel in enumerate(indptr_sels[1:]): - total = (sel[0] - indptr_sels[i][-1]) + total - offsets.append(total) - start_indptr = indptr_sels[0] - offsets[0] + gaps = (s1.start - s0.stop for s0, s1 in pairwise(indptr_limits)) + offsets = accumulate(chain([indptr_limits[0].start], gaps)) + start_indptr = indptr_indices[0] - next(offsets) if len(slices) < 2: # there is only one slice so no need to concatenate return data, indices, start_indptr end_indptr = np.concatenate( - [s[1:] - offsets[i + 1] for i, s in enumerate(indptr_sels[1:])] + [s[1:] - o for s, o in zip(indptr_indices[1:], offsets)] ) indptr = np.concatenate([start_indptr, end_indptr]) return data, indices, indptr