diff --git a/docs/release-notes/3426.bugfix.md b/docs/release-notes/3426.bugfix.md new file mode 100644 index 0000000000..4565f1ee35 --- /dev/null +++ b/docs/release-notes/3426.bugfix.md @@ -0,0 +1 @@ +Fix {func}`~scanpy.tl.rank_genes_groups` compatibility with data >10M cells {smaller}`P Angerer` diff --git a/src/scanpy/tools/_rank_genes_groups.py b/src/scanpy/tools/_rank_genes_groups.py index 2c214fcfdd..cafb78c6f1 100644 --- a/src/scanpy/tools/_rank_genes_groups.py +++ b/src/scanpy/tools/_rank_genes_groups.py @@ -2,7 +2,6 @@ from __future__ import annotations -from math import floor from typing import TYPE_CHECKING, Literal import numpy as np @@ -32,6 +31,8 @@ # Used with get_literal_vals _Method = Literal["logreg", "t-test", "wilcoxon", "t-test_overestim_var"] +_CONST_MAX_SIZE = 10000000 + def _select_top_n(scores: NDArray, n_top: int): n_from = scores.shape[0] @@ -47,9 +48,7 @@ def _ranks( X: np.ndarray | sparse.csr_matrix | sparse.csc_matrix, mask_obs: NDArray[np.bool_] | None = None, mask_obs_rest: NDArray[np.bool_] | None = None, -): - CONST_MAX_SIZE = 10000000 - +) -> Generator[tuple[pd.DataFrame, int, int], None, None]: n_genes = X.shape[1] if issparse(X): @@ -71,7 +70,7 @@ def _ranks( get_chunk = lambda X, left, right: adapt(X[:, left:right]) # Calculate chunk frames - max_chunk = floor(CONST_MAX_SIZE / n_cells) + max_chunk = max(_CONST_MAX_SIZE // n_cells, 1) for left in range(0, n_genes, max_chunk): right = min(left + max_chunk, n_genes) @@ -81,7 +80,7 @@ def _ranks( yield ranks, left, right -def _tiecorrect(ranks): +def _tiecorrect(ranks: pd.DataFrame) -> np.float64: size = np.float64(ranks.shape[0]) if size < 2: return np.repeat(ranks.shape[1], 1.0) diff --git a/tests/test_rank_genes_groups.py b/tests/test_rank_genes_groups.py index a36e6b14f1..788c7e705d 100644 --- a/tests/test_rank_genes_groups.py +++ b/tests/test_rank_genes_groups.py @@ -307,6 +307,13 @@ def test_wilcoxon_tie_correction(reference): np.testing.assert_allclose(test_obj.stats[groups[0]]["pvals"], pvals) +def test_wilcoxon_huge_data(monkeypatch): + max_size = 300 + adata = pbmc68k_reduced() + monkeypatch.setattr(sc.tl._rank_genes_groups, "_CONST_MAX_SIZE", max_size) + rank_genes_groups(adata, groupby="bulk_labels", method="wilcoxon") + + @pytest.mark.parametrize( ("n_genes_add", "n_genes_out_add"), [pytest.param(0, 0, id="equal"), pytest.param(2, 1, id="more")],