Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip labels before loop #394

Merged
merged 6 commits into from
Aug 7, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 58 additions & 58 deletions scib/metrics/kbet.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,39 +107,72 @@ def kBET(
# set upper bound for k0
size_max = 2**31 - 1

# check if neighborhood size too small or only one batch in subset
counts = adata_tmp.obs.groupby(label_key).agg(
{label_key: "count", batch_key: "nunique"}
)
labels = counts.query(f"{label_key}>=10 and {batch_key} > 1").index
skipped = counts.index.difference(labels)
print(f"{len(skipped)} labels consist of a single batch or is too small. Skip.")
# prepare call of kBET per cluster
kBET_scores = {"cluster": [], "kBET": []}
for clus in adata_tmp.obs[label_key].unique():
kBET_scores = {"cluster": list(skipped), "kBET": [np.nan] * len(skipped)}
for clus in labels:

# subset by label
adata_sub = adata_tmp[adata_tmp.obs[label_key] == clus, :].copy()

# check if neighborhood size too small or only one batch in subset
if np.logical_or(
adata_sub.n_obs < 10, len(adata_sub.obs[batch_key].cat.categories) == 1
):
print(f"{clus} consists of a single batch or is too small. Skip.")
score = np.nan
else:
quarter_mean = np.floor(
np.mean(adata_sub.obs[batch_key].value_counts()) / 4
).astype("int")
k0 = np.min([70, np.max([10, quarter_mean])])
# check k0 for reasonability
if k0 * adata_sub.n_obs >= size_max:
k0 = np.floor(size_max / adata_sub.n_obs).astype("int")
quarter_mean = np.floor(
np.mean(adata_sub.obs[batch_key].value_counts()) / 4
).astype("int")
k0 = np.min([70, np.max([10, quarter_mean])])
# check k0 for reasonability
if k0 * adata_sub.n_obs >= size_max:
k0 = np.floor(size_max / adata_sub.n_obs).astype("int")

matrix = np.zeros(shape=(adata_sub.n_obs, k0 + 1))
matrix = np.zeros(shape=(adata_sub.n_obs, k0 + 1))

if verbose:
print(f"Use {k0} nearest neighbors.")
n_comp, labs = scipy.sparse.csgraph.connected_components(
adata_sub.obsp["connectivities"], connection="strong"
if verbose:
print(f"Use {k0} nearest neighbors.")
n_comp, labs = scipy.sparse.csgraph.connected_components(
adata_sub.obsp["connectivities"], connection="strong"
)

if n_comp == 1: # a single component to compute kBET on
try:
nn_index_tmp = diffusion_nn(adata_sub, k=k0).astype("float")
# call kBET
score = kBET_single(
matrix=matrix,
batch=adata_sub.obs[batch_key],
knn=nn_index_tmp
+ 1, # nn_index in python is 0-based and 1-based in R
verbose=verbose,
k0=k0,
)
except NeighborsError:
print("Not enough neighbours")
score = 1 # i.e. 100% rejection

else:
# check the number of components where kBET can be computed upon
comp_size = pd.value_counts(labs)
# check which components are small
comp_size_thresh = 3 * k0
idx_nonan = np.flatnonzero(
np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index)
)

if n_comp == 1: # a single component to compute kBET on
# check if 75% of all cells can be used for kBET run
if len(idx_nonan) / len(labs) >= 0.75:
# create another subset of components, assume they are not visited in a diffusion process
adata_sub_sub = adata_sub[idx_nonan, :].copy()
nn_index_tmp = np.empty(shape=(adata_sub.n_obs, k0))
nn_index_tmp[:] = np.nan

try:
nn_index_tmp = diffusion_nn(adata_sub, k=k0).astype("float")
nn_index_tmp[idx_nonan] = diffusion_nn(adata_sub_sub, k=k0).astype(
"float"
)
# call kBET
score = kBET_single(
matrix=matrix,
Expand All @@ -152,41 +185,8 @@ def kBET(
except NeighborsError:
print("Not enough neighbours")
score = 1 # i.e. 100% rejection

else:
# check the number of components where kBET can be computed upon
comp_size = pd.value_counts(labs)
# check which components are small
comp_size_thresh = 3 * k0
idx_nonan = np.flatnonzero(
np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index)
)

# check if 75% of all cells can be used for kBET run
if len(idx_nonan) / len(labs) >= 0.75:
# create another subset of components, assume they are not visited in a diffusion process
adata_sub_sub = adata_sub[idx_nonan, :].copy()
nn_index_tmp = np.empty(shape=(adata_sub.n_obs, k0))
nn_index_tmp[:] = np.nan

try:
nn_index_tmp[idx_nonan] = diffusion_nn(
adata_sub_sub, k=k0
).astype("float")
# call kBET
score = kBET_single(
matrix=matrix,
batch=adata_sub.obs[batch_key],
knn=nn_index_tmp
+ 1, # nn_index in python is 0-based and 1-based in R
verbose=verbose,
k0=k0,
)
except NeighborsError:
print("Not enough neighbours")
score = 1 # i.e. 100% rejection
else: # if there are too many too small connected components, set kBET score to 1
score = 1 # i.e. 100% rejection
else: # if there are too many too small connected components, set kBET score to 1
score = 1 # i.e. 100% rejection

kBET_scores["cluster"].append(clus)
kBET_scores["kBET"].append(score)
Expand Down
Loading