Commit bebace0

Merge branch 'main' into repeated-cols

ivirshup committed Jan 26, 2024
2 parents b2deb2a + 299ca97

Showing 12 changed files with 112 additions and 57 deletions.
1 change: 1 addition & 0 deletions .azure-pipelines.yml
@@ -1,5 +1,6 @@
 trigger:
 - main
+- "*.*.x"
 
 variables:
   PIP_CACHE_DIR: $(Pipeline.Workspace)/.pip
2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -2,7 +2,7 @@ name: Benchmark
 
 on:
   push:
-    branches: [main]
+    branches: [main, "[0-9]+.[0-9]+.x"]
   pull_request:
     branches: [main]
 
2 changes: 1 addition & 1 deletion .github/workflows/test-gpu.yml
@@ -2,7 +2,7 @@ name: AWS GPU
 
 on:
   push:
-    branches: [main]
+    branches: [main, "[0-9]+.[0-9]+.x"]
   pull_request:
     types:
       - labeled
17 changes: 4 additions & 13 deletions anndata/_core/index.py
@@ -42,7 +42,7 @@ def _normalize_index(
     | np.integer
     | int
     | str
-    | Sequence[int | np.integer]
+    | Sequence[bool | int | np.integer]
     | np.ndarray
     | pd.Index,
     index: pd.Index,
@@ -80,18 +80,9 @@ def name_idx(i):
             indexer = indexer.toarray()
             indexer = np.ravel(indexer)
         if not isinstance(indexer, (np.ndarray, pd.Index)):
-            dtype = "int"
-            if (
-                all(isinstance(x, str) for x in indexer) and len(indexer) > 0
-            ):  # if not all, but any, then dtype=int will cause an error
-                dtype = "object"
-            try:
-                indexer = np.array(indexer, dtype=dtype)
-            except ValueError as e:
-                if str(e).startswith("invalid literal for"):
-                    msg = "Mixed type list indexers not supported."
-                    raise ValueError(msg) from e
-                raise e
+            indexer = np.array(indexer)
+            if len(indexer) == 0:
+                indexer = indexer.astype(int)
         if issubclass(indexer.dtype.type, (np.integer, np.floating)):
             return indexer  # Might not work for range indexes
         elif issubclass(indexer.dtype.type, np.bool_):
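
The replacement lets NumPy infer the indexer's dtype, which fixes the 0.10.4 regression where `dtype="int"` silently coerced a boolean list to integer positions, and casts only the empty case to `int` (since `np.array([])` infers `float64`, which would fail the dtype checks that follow). A side effect is that a mixed list such as `[1, "cell_c"]` is now converted by NumPy (typically to a string array) instead of raising `Mixed type list indexers not supported`, which is why `test_subset_errors` is commented out in `anndata/tests/test_views.py` below. A standalone sketch of the new logic, using a hypothetical helper name rather than anndata internals:

```python
import numpy as np

def normalize_list_indexer(indexer: list) -> np.ndarray:
    # Mirror of the new branch above: let NumPy infer the dtype.
    arr = np.array(indexer)
    if len(arr) == 0:
        # np.array([]) infers float64; cast so it works as a positional index.
        arr = arr.astype(int)
    return arr

assert normalize_list_indexer([True, False, True]).dtype == np.bool_  # kept as a mask
assert normalize_list_indexer([0, 2]).dtype.kind == "i"               # integer positions
assert normalize_list_indexer([]).dtype.kind == "i"                   # empty list -> int
```
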
14 changes: 10 additions & 4 deletions anndata/_core/sparse_dataset.py
@@ -180,6 +180,8 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix:
 
     def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix:
         idxs = np.asarray(row)
+        if len(idxs) == 0:
+            return ss.csr_matrix((0, self.shape[1]))
         if idxs.dtype == bool:
             idxs = np.where(idxs)
         return ss.csr_matrix(
@@ -214,6 +216,8 @@ def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix:
 
     def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix:
         idxs = np.asarray(col)
+        if len(idxs) == 0:
+            return ss.csc_matrix((self.shape[0], 0))
         if idxs.dtype == bool:
             idxs = np.where(idxs)
         return ss.csc_matrix(
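
These early returns give backed sparse matrices the same result as scipy's in-memory behavior for empty selections, without reading anything from disk. A quick standalone check of the intended equivalence (plain scipy, no backed classes involved):

```python
import numpy as np
import scipy.sparse as ss

X = ss.random(50, 50, format="csr", density=0.1)
empty = np.array([], dtype=int)

# An empty row selection on an in-memory CSR matrix has shape (0, 50); the
# early return above constructs that shape directly instead of slicing.
assert X[empty].shape == (0, 50)
assert ss.csr_matrix((0, X.shape[1])).shape == (0, 50)
```
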
@@ -290,10 +294,12 @@ def mean_slice_length(slices):
         return floor((slices[-1].stop - slices[0].start) / len(slices))
 
     # heuristic for whether slicing should be optimized
-    if mean_slice_length(slices) <= 7:
-        return get_compressed_vectors(mtx, np.where(mask)[0])
-    else:
-        return get_compressed_vectors_for_slices(mtx, slices)
+    if len(slices) > 0:
+        if mean_slice_length(slices) <= 7:
+            return get_compressed_vectors(mtx, np.where(mask)[0])
+        else:
+            return get_compressed_vectors_for_slices(mtx, slices)
+    return [], [], [0]
 
 
 def get_format(data: ss.spmatrix) -> str:
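
The new guard matters because an all-`False` mask produces no slices at all, so `mean_slice_length` would hit `slices[-1]` on an empty list and raise `IndexError`; `return [], [], [0]` reads as an empty (data, indices, indptr) triple. A sketch of the mask-to-slices idea, using a hypothetical `mask_to_slices` helper (anndata's actual helper may differ):

```python
import numpy as np

def mask_to_slices(mask: np.ndarray) -> list[slice]:
    # Hypothetical helper: collapse runs of True into contiguous slices.
    (idxs,) = np.where(mask)
    slices: list[slice] = []
    for i in idxs:
        if slices and slices[-1].stop == i:
            slices[-1] = slice(slices[-1].start, i + 1)
        else:
            slices.append(slice(i, i + 1))
    return slices

print(mask_to_slices(np.array([True, True, False, True])))  # two runs: [0, 2) and [3, 4)
print(mask_to_slices(np.zeros(4, dtype=bool)))              # [] -> needs the len() guard
```
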
48 changes: 33 additions & 15 deletions anndata/tests/helpers.py
@@ -12,7 +12,6 @@
 import numpy as np
 import pandas as pd
 import pytest
-import zarr
 from pandas.api.types import is_numeric_dtype
 from scipy import sparse
 
@@ -284,6 +283,10 @@ def array_bool_subset(index, min_size=2):
     return b
 
 
+def list_bool_subset(index, min_size=2):
+    return array_bool_subset(index, min_size=min_size).tolist()
+
+
 def matrix_bool_subset(index, min_size=2):
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", PendingDeprecationWarning)
@@ -321,6 +324,10 @@ def array_int_subset(index, min_size=2):
     )
 
 
+def list_int_subset(index, min_size=2):
+    return array_int_subset(index, min_size=min_size).tolist()
+
+
 def slice_subset(index, min_size=2):
     while True:
         points = np.random.choice(np.arange(len(index) + 1), size=2, replace=False)
@@ -340,7 +347,9 @@ def single_subset(index):
     slice_subset,
     single_subset,
     array_int_subset,
+    list_int_subset,
     array_bool_subset,
+    list_bool_subset,
     matrix_bool_subset,
     spmatrix_bool_subset,
 ]
@@ -749,21 +758,30 @@ def shares_memory_sparse(x, y):
     ),
 ]
 
+try:
+    import zarr
+
-class AccessTrackingStore(zarr.DirectoryStore):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._access_count = {}
+    class AccessTrackingStore(zarr.DirectoryStore):
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+            self._access_count = {}
 
-    def __getitem__(self, key):
-        for tracked in self._access_count:
-            if tracked in key:
-                self._access_count[tracked] += 1
-        return super().__getitem__(key)
+        def __getitem__(self, key):
+            for tracked in self._access_count:
+                if tracked in key:
+                    self._access_count[tracked] += 1
+            return super().__getitem__(key)
 
-    def get_access_count(self, key):
-        return self._access_count[key]
+        def get_access_count(self, key):
+            return self._access_count[key]
 
-    def set_key_trackers(self, keys_to_track):
-        for k in keys_to_track:
-            self._access_count[k] = 0
+        def set_key_trackers(self, keys_to_track):
+            for k in keys_to_track:
+                self._access_count[k] = 0
+
+except ImportError:
+
+    class AccessTrackingStore:
+        def __init__(self, *_args, **_kwargs) -> None:
+            raise ImportError(
+                "zarr must be imported to create an `AccessTrackingStore` instance."
+            )
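
Wrapping the class in `try`/`except ImportError` lets `anndata.tests.helpers` be imported without zarr installed (the scanpy CI case), while failing loudly only when a store is actually constructed. A usage sketch, assuming zarr v2 is available and using a hypothetical `example.zarr` path:

```python
import numpy as np
import zarr

from anndata.tests.helpers import AccessTrackingStore

store = AccessTrackingStore("example.zarr")
zarr.save_group(store, X=np.ones((10, 10)))

# Count reads of any key containing "X" (chunk and metadata keys alike).
store.set_key_trackers(["X"])
_ = zarr.open(store)["X"][...]
print(store.get_access_count("X"))
```
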
25 changes: 24 additions & 1 deletion anndata/tests/test_backed_sparse.py
@@ -27,6 +27,10 @@ def diskfmt(request):
     return request.param
 
 
+M = 50
+N = 50
+
+
 @pytest.fixture(scope="function")
 def ondisk_equivalent_adata(
     tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]
@@ -37,7 +41,7 @@ def ondisk_equivalent_adata(
 
     write = lambda x, pth, **kwargs: getattr(x, f"write_{diskfmt}")(pth, **kwargs)
 
-    csr_mem = ad.AnnData(X=sparse.random(50, 50, format="csr", density=0.1))
+    csr_mem = ad.AnnData(X=sparse.random(M, N, format="csr", density=0.1))
     csc_mem = ad.AnnData(X=csr_mem.X.tocsc())
     dense_mem = ad.AnnData(X=csr_mem.X.toarray())
 
@@ -77,6 +81,25 @@ def callback(func, elem_name, elem, iospec):
     return csr_mem, csr_disk, csc_disk, dense_disk
 
 
+@pytest.mark.parametrize(
+    "empty_mask", [[], np.zeros(M, dtype=bool)], ids=["empty_list", "empty_bool_mask"]
+)
+def test_empty_backed_indexing(
+    ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
+    empty_mask,
+):
+    csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata
+
+    assert_equal(csr_mem.X[empty_mask], csr_disk.X[empty_mask])
+    assert_equal(csr_mem.X[:, empty_mask], csc_disk.X[:, empty_mask])
+
+    # The following do not work because of https://github.com/scipy/scipy/issues/19919
+    # Our implementation returns a (0,0) sized matrix but scipy does (1,0).
+
+    # assert_equal(csr_mem.X[empty_mask, empty_mask], csr_disk.X[empty_mask, empty_mask])
+    # assert_equal(csr_mem.X[empty_mask, empty_mask], csc_disk.X[empty_mask, empty_mask])
+
+
 def test_backed_indexing(
     ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
     subset_func,
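
For reference, the scipy behavior that blocks the commented-out assertions can be reproduced directly on scipy versions affected by scipy/scipy#19919:

```python
import numpy as np
import scipy.sparse as ss

X = ss.random(50, 50, format="csr", density=0.1)
mask = np.zeros(50, dtype=bool)

# Affected scipy versions report (1, 0) here instead of the expected (0, 0),
# which is why the dual-axis assertions above stay commented out.
print(X[mask, mask].shape)
```
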
4 changes: 2 additions & 2 deletions anndata/tests/test_hdf5_backing.py
@@ -193,8 +193,8 @@ def test_backed_raw_subset(tmp_path, array_type, subset_func, subset_func2):
     var_idx = subset_func2(mem_adata.var_names)
     if (
         array_type is asarray
-        and isinstance(obs_idx, (np.ndarray, sparse.spmatrix))
-        and isinstance(var_idx, (np.ndarray, sparse.spmatrix))
+        and isinstance(obs_idx, (list, np.ndarray, sparse.spmatrix))
+        and isinstance(var_idx, (list, np.ndarray, sparse.spmatrix))
     ):
         pytest.xfail(
             "Fancy indexing does not work with multiple arrays on a h5py.Dataset"
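
The xfail now also covers plain lists because, with the indexing fix, lists are valid indexers and hit the same h5py limitation as arrays: h5py datasets allow fancy indexing along only one axis per selection. A minimal reproduction of the underlying restriction, writing to a throwaway `fancy.h5` file:

```python
import h5py
import numpy as np

with h5py.File("fancy.h5", "w") as f:
    dset = f.create_dataset("X", data=np.arange(16).reshape(4, 4))
    try:
        dset[[0, 1], [0, 2]]  # fancy indexing on both axes at once
    except TypeError as e:
        print(e)  # h5py permits only one indexing vector per selection
```
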
38 changes: 19 additions & 19 deletions anndata/tests/test_views.py
@@ -703,22 +703,22 @@ def test_empty_list_subset():
     assert subset.varm["sparse"].shape == (0, 100)
 
 
-@pytest.mark.parametrize("dim", ["obs", "var"])
-@pytest.mark.parametrize(
-    ("idx", "pat"),
-    [
-        pytest.param(
-            [1, "cell_c"], r"Mixed type list indexers not supported", id="mixed"
-        ),
-        pytest.param(
-            [[1, 2], [2]], r"setting an array element with a sequence", id="nested"
-        ),
-    ],
-)
-def test_subset_errors(dim, idx, pat):
-    orig = gen_adata((10, 10))
-    with pytest.raises(ValueError, match=pat):
-        if dim == "obs":
-            orig[idx, :].X
-        elif dim == "var":
-            orig[:, idx].X
+# @pytest.mark.parametrize("dim", ["obs", "var"])
+# @pytest.mark.parametrize(
+#     ("idx", "pat"),
+#     [
+#         pytest.param(
+#             [1, "cell_c"], r"Mixed type list indexers not supported", id="mixed"
+#         ),
+#         pytest.param(
+#             [[1, 2], [2]], r"setting an array element with a sequence", id="nested"
+#         ),
+#     ],
+# )
+# def test_subset_errors(dim, idx, pat):
+#     orig = gen_adata((10, 10))
+#     with pytest.raises(ValueError, match=pat):
+#         if dim == "obs":
+#             orig[idx, :].X
+#         elif dim == "var":
+#             orig[:, idx].X
3 changes: 2 additions & 1 deletion docs/release-notes/0.10.5.md
@@ -1,11 +1,12 @@
-### 0.10.5 {small}`the future`
+### 0.10.5 {small}`2024-01-25`
 
 ```{rubric} Bugfix
 ```
 
 * Fix outer concatenation along variables when only a subset of objects had an entry in layers {pr}`1291` {user}`ivirshup`
 * Fix comparison of >2d arrays in `uns` during concatenation {pr}`1300` {user}`ivirshup`
 * Fix IO with awkward array version 2.5.2 {pr}`1328` {user}`ivirshup`
+* Fix bug (introduced in 0.10.4) where indexing an AnnData with `list[bool]` would return the wrong result {pr}`1332` {user}`ivirshup`
 
 ```{rubric} Documentation
 ```
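
The `list[bool]` fix from {pr}`1332` is easiest to see end to end; a small sketch of the corrected behavior on a 4 × 3 toy AnnData:

```python
import numpy as np
import anndata as ad

adata = ad.AnnData(np.arange(12, dtype=np.float32).reshape(4, 3))
mask = [True, False, True, False]

# 0.10.4 coerced the list to ints ([1, 0, 1, 0]) and indexed by position;
# 0.10.5 treats it as a boolean mask again, matching the ndarray case.
assert adata[mask].shape == (2, 3)
assert np.array_equal(adata[mask].X, adata.X[np.array(mask)])
```
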
12 changes: 12 additions & 0 deletions docs/release-notes/0.10.6.md
@@ -0,0 +1,12 @@
+### 0.10.6 {small}`the future`
+
+```{rubric} Bugfix
+```
+
+* Defer import of zarr in test helpers, as scanpy CI job relies on them {pr}`1343` {user}`ilan-gold`
+
+```{rubric} Documentation
+```
+
+```{rubric} Performance
+```
3 changes: 3 additions & 0 deletions docs/release-notes/release-latest.md
@@ -5,6 +5,9 @@
 
 ## Version 0.10
 
+```{include} /release-notes/0.10.6.md
+```
+
 ```{include} /release-notes/0.10.5.md
 ```
 
