Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(fix): empty boolean mask on backed sparse matrix #1321

Merged
merged 12 commits into from
Jan 25, 2024
10 changes: 8 additions & 2 deletions anndata/_core/sparse_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix:

def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix:
idxs = np.asarray(row)
if len(idxs) == 0:
return ss.csr_matrix((0, self.shape[1]))
if idxs.dtype == bool:
idxs = np.where(idxs)
return ss.csr_matrix(
Expand Down Expand Up @@ -214,6 +216,8 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix:

def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix:
idxs = np.asarray(col)
if len(idxs) == 0:
return ss.csc_matrix((self.shape[0], 0))
if idxs.dtype == bool:
idxs = np.where(idxs)
return ss.csc_matrix(
Expand Down Expand Up @@ -262,6 +266,8 @@ def get_compressed_vectors_for_slices(
total = (sel[0] - indptr_sels[i][-1]) + total
offsets.append(total)
start_indptr = indptr_sels[0] - offsets[0]
if len(slices) < 2: # there is only one slice so no need to concatenate
return data, indices, start_indptr
end_indptr = np.concatenate(
[s[1:] - offsets[i + 1] for i, s in enumerate(indptr_sels[1:])]
)
Expand Down Expand Up @@ -407,11 +413,11 @@ def __getitem__(self, index: Index | tuple[()]) -> float | ss.spmatrix:
mtx = self._to_backed()

# Handle masked indexing along major axis
if self.format == "csr" and np.array(row).dtype == bool:
if self.format == "csr" and np.array(row).dtype == bool and row.sum() != 0:
ilan-gold marked this conversation as resolved.
Show resolved Hide resolved
sub = ss.csr_matrix(
subset_by_major_axis_mask(mtx, row), shape=(row.sum(), mtx.shape[1])
)[:, col]
elif self.format == "csc" and np.array(col).dtype == bool:
elif self.format == "csc" and np.array(col).dtype == bool and col.sum() != 0:
sub = ss.csc_matrix(
subset_by_major_axis_mask(mtx, col), shape=(mtx.shape[0], col.sum())
)[row, :]
Expand Down
86 changes: 61 additions & 25 deletions anndata/tests/test_backed_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,24 @@ def callback(func, elem_name, elem, iospec):
return csr_mem, csr_disk, csc_disk, dense_disk


def test_empty_backed_indexing(
ilan-gold marked this conversation as resolved.
Show resolved Hide resolved
ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
):
csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata

obs_idx = np.zeros(csr_disk.shape[0], dtype=bool)
var_idx = np.zeros(csr_disk.shape[1], dtype=bool)

assert_equal(csr_mem.X[obs_idx], csr_disk.X[obs_idx])
assert_equal(csr_mem.X[:, var_idx], csc_disk.X[:, var_idx])

# The following assertions are disabled because of https://github.com/scipy/scipy/issues/19919:
# when both masks are empty, our implementation returns a (0, 0) matrix but scipy returns a (1, 0) matrix.

# assert_equal(csr_mem.X[obs_idx, var_idx], csr_disk.X[obs_idx, var_idx])
# assert_equal(csr_mem.X[obs_idx, var_idx], csc_disk.X[obs_idx, var_idx])


def test_backed_indexing(
ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
subset_func,
Expand All @@ -95,39 +113,57 @@ def test_backed_indexing(
assert_equal(csr_mem[:, var_idx].X, dense_disk[:, var_idx].X)


# test behavior from https://github.com/scverse/anndata/pull/1233
def test_consecutive_bool(
ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
):
_, csr_disk, csc_disk, _ = ondisk_equivalent_adata

randomized_mask = np.zeros(csr_disk.shape[0], dtype=bool)
inds = np.random.choice(csr_disk.shape[0], 20, replace=False)
def make_randomized_mask(size: int) -> np.ndarray:
    """Return a boolean mask of length ``size`` with random runs of ``True``.

    Up to 20 distinct positions are drawn at random and sorted; consecutive
    pairs ``(start, stop)`` of those positions delimit half-open runs that
    are set to ``True``.  With 20 positions this yields 10 separate runs.

    Parameters
    ----------
    size
        Length of the mask.

    Returns
    -------
    A 1-D boolean array of length ``size``.
    """
    randomized_mask = np.zeros(size, dtype=bool)
    # Sampling without replacement cannot draw more points than exist, so cap
    # the number of boundaries at ``size`` instead of raising for small inputs.
    n_boundaries = min(20, size)
    if n_boundaries < 2:
        # Not enough positions to form a single (start, stop) pair.
        return randomized_mask
    inds = np.sort(np.random.choice(size, n_boundaries, replace=False))
    # Pair up boundaries as (0, 1), (2, 3), ...; an unpaired trailing
    # boundary is dropped by ``zip``.
    for start, stop in zip(inds[::2], inds[1::2]):
        randomized_mask[start:stop] = True
    return randomized_mask

# non-random indices, with alternating one false and n true
def make_alternating_mask(n):
mask_alternating = np.ones(csr_disk.shape[0], dtype=bool)
for i in range(0, csr_disk.shape[0], n):
mask_alternating[i] = False
return mask_alternating

alternating_mask = make_alternating_mask(10)
# non-random mask: every 10th entry is False, all other entries are True
def make_alternating_mask(size: int, step: int = 10) -> np.ndarray:
    """Return a deterministic boolean mask with a regular pattern of gaps.

    Every ``step``-th entry (starting at index 0) is ``False``; all other
    entries are ``True``.

    Parameters
    ----------
    size
        Length of the mask.
    step
        Distance between consecutive ``False`` entries.  The default of 10
        is enough to trigger the new behavior under test.

    Returns
    -------
    A 1-D boolean array of length ``size``.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step}")
    mask_alternating = np.ones(size, dtype=bool)
    # Strided slice assignment clears indices 0, step, 2 * step, ... in one go.
    mask_alternating[::step] = False
    return mask_alternating


def make_one_group_mask(size: int) -> np.ndarray:
    """Return a boolean mask of length ``size`` with one contiguous ``True`` run.

    The run spans indices ``size // 4`` (inclusive) to ``size // 2``
    (exclusive); every other entry is ``False``.
    """
    group = slice(size // 4, size // 2)
    mask = np.full(size, False)
    mask[group] = True
    return mask


def make_one_elem_mask(size: int) -> np.ndarray:
    """Return a boolean mask of length ``size`` with a single ``True`` entry.

    The ``True`` entry sits at index ``size // 4``.  For ``size == 0`` there
    is no position to select, so the empty mask is returned unchanged.

    Parameters
    ----------
    size
        Length of the mask.

    Returns
    -------
    A 1-D boolean array of length ``size``.
    """
    one_elem_mask = np.zeros(size, dtype=bool)
    # Guard the empty case: indexing position 0 of a zero-length array
    # would raise IndexError.
    if size > 0:
        one_elem_mask[size // 4] = True
    return one_elem_mask


# test behavior from https://github.com/scverse/anndata/pull/1233
@pytest.mark.parametrize(
"make_bool_mask",
[
make_randomized_mask,
make_alternating_mask,
make_one_group_mask,
make_one_elem_mask,
],
ids=["randomized", "alternating", "one_group", "one_elem"],
)
def test_consecutive_bool(
ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
make_bool_mask: Callable[[int], np.ndarray],
):
_, csr_disk, csc_disk, _ = ondisk_equivalent_adata
mask = make_bool_mask(csr_disk.shape[0])

# indexing needs to be on `X` directly to trigger the optimization.
# `_normalize_indices`, which is used by `AnnData`, converts bools to ints with `np.where`
assert_equal(
csr_disk.X[alternating_mask, :], csr_disk.X[np.where(alternating_mask)]
)
assert_equal(
csc_disk.X[:, alternating_mask], csc_disk.X[:, np.where(alternating_mask)[0]]
)
assert_equal(csr_disk.X[randomized_mask, :], csr_disk.X[np.where(randomized_mask)])
assert_equal(
csc_disk.X[:, randomized_mask], csc_disk.X[:, np.where(randomized_mask)[0]]
)
assert_equal(csr_disk.X[mask, :], csr_disk.X[np.where(mask)])
assert_equal(csc_disk.X[:, mask], csc_disk.X[:, np.where(mask)[0]])


@pytest.mark.parametrize(
Expand Down