Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✅ More tests for Artifact.df() #2241

Merged
merged 3 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions docs/registries.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -620,9 +620,49 @@
},
"outputs": [],
"source": [
"ln.Artifact.filter().order_by(\"created_at\").df()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec39c7ff",
"metadata": {
"tags": [
"hide-output"
]
},
"outputs": [],
"source": [
"# reverse ordering\n",
"ln.Artifact.filter().order_by(\"-created_at\").df()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9830119",
"metadata": {
"tags": [
"hide-output"
]
},
"outputs": [],
"source": [
"ln.Artifact.filter().order_by(\"key\").df()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a705b47",
"metadata": {},
"outputs": [],
"source": [
"# reverse ordering\n",
"ln.Artifact.filter().order_by(\"-key\").df()"
]
},
{
"attachments": {},
"cell_type": "markdown",
Expand Down
File renamed without changes.
120 changes: 107 additions & 13 deletions tests/core/test_curate_annotate_df.py → tests/core/test_describe_df.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,53 @@
import anndata as ad
import bionty as bt
import lamindb as ln
import numpy as np
import pandas as pd
from lamindb.core import datasets
from lamindb.core._data import _describe_postgres
from lamindb.core.datasets import small_dataset1


def test_curate_annotate_df():
## Define the schema of the dataset & its metadata
def check_df_equality(actual_df: pd.DataFrame, expected_df: pd.DataFrame):
    """Check equality between two DataFrames.

    Compares columns pairwise with special handling for columns whose
    values are sets (order-insensitive set comparison) and for NaN values
    (two NaNs compare equal). Row indices are intentionally NOT compared;
    the expected frame is aligned to the actual frame's index first.

    Args:
        actual_df: The DataFrame produced by the code under test.
        expected_df: The DataFrame holding the expected values.

    Returns:
        True if the frames are equal under the rules above.

    Raises:
        AssertionError: On any column-set, NaN-placement, or value mismatch.
    """
    assert len(actual_df) == len(expected_df), "row counts differ"
    # do not test indices by default
    # pd.testing.assert_index_equal(actual_df.index, expected_df.index)
    # align on the actual index; operate on a copy so the caller's
    # DataFrame is not mutated as a side effect
    expected_df = expected_df.copy()
    expected_df.index = actual_df.index
    assert set(actual_df.columns) == set(expected_df.columns)
    for col in expected_df.columns:
        # Detect if column contains sets by checking first non-null value
        first_value = next((v for v in expected_df[col] if pd.notna(v)), None)
        if isinstance(first_value, set):
            # For set columns, compare element-wise with NaN handling
            for idx in expected_df.index:
                actual_val = actual_df.loc[idx, col]
                expected_val = expected_df.loc[idx, col]
                # If both are NaN, they're equal
                if pd.isna(actual_val) and pd.isna(expected_val):
                    continue
                # If one is NaN and the other isn't, they're not equal
                if pd.isna(actual_val) != pd.isna(expected_val):
                    raise AssertionError(f"NaN mismatch at index {idx} in column {col}")
                # If neither is NaN, compare the sets
                assert (
                    actual_val == expected_val
                ), f"Set mismatch at index {idx} in column {col}"
        else:
            pd.testing.assert_series_equal(
                actual_df[col],
                expected_df[col],
                check_names=False,  # ignore series names
            )
    return True


# parallels the `registries` guide
# please also see the test_queryset.py tests
def test_curate_df():
## Create a more complex case
# observation-level metadata
ln.Feature(name="cell_medium", dtype="cat[ULabel]").save()
ln.Feature(name="sample_note", dtype="str").save()
Expand All @@ -19,20 +58,17 @@ def test_curate_annotate_df():
ln.Feature(name="study", dtype="cat[ULabel]").save()
ln.Feature(name="date_of_study", dtype="date").save()
ln.Feature(name="study_note", dtype="str").save()

## Register permissible values for categoricals
## Permissible values for categoricals
ln.ULabel.from_values(["DMSO", "IFNG"], create=True).save()
ln.ULabel.from_values(
["Candidate marker study 1", "Candidate marker study 2"], create=True
).save()
bt.CellType.from_values(["B cell", "T cell"], create=True).save()

## Ingest a dataset
dataset_ad = small_dataset1(format="anndata")

# curate dataset
## Ingest dataset1
adata = datasets.small_dataset1(format="anndata")
curator = ln.Curator.from_anndata(
dataset_ad,
adata,
var_index=bt.Gene.symbol,
categoricals={
"cell_medium": ln.ULabel.name,
Expand All @@ -42,13 +78,70 @@ def test_curate_annotate_df():
organism="human",
)
artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
# annotate with dataset-level features
artifact.features.add_values(dataset_ad.uns)
artifact.features.add_values(adata.uns)

# Ingest dataset2
adata2 = datasets.small_dataset2(format="anndata")
curator = ln.Curator.from_anndata(
adata2,
var_index=bt.Gene.symbol,
categoricals={
"cell_medium": ln.ULabel.name,
"cell_type_by_model": bt.CellType.name,
},
organism="human",
)
artifact2 = curator.save_artifact(key="example_datasets/dataset2.h5ad")
artifact2.features.add_values(adata2.uns)

# Test df(include=[...])
df = (
ln.Artifact.filter(key__startswith="example_datasets/dataset", suffix=".h5ad")
.order_by("-key")
.df(include=["feature_sets__hash", "feature_sets__name"])
.drop(["uid"], axis=1)
)
expected_data = {
"key": ["example_datasets/dataset2.h5ad", "example_datasets/dataset1.h5ad"],
"description": [None, None],
"feature_sets__hash": [
set(artifact2.feature_sets.all().values_list("hash", flat=True)),
set(artifact.feature_sets.all().values_list("hash", flat=True)),
],
"feature_sets__name": [{None}, {None}],
}
expected_df = pd.DataFrame(expected_data)
check_df_equality(df, expected_df)

# Test df(features=True)
df = (
ln.Artifact.filter(key__startswith="example_datasets/dataset", suffix=".h5ad")
.order_by("-key")
.df(features=True)
.drop(["uid"], axis=1)
)
expected_data = {
"key": ["example_datasets/dataset2.h5ad", "example_datasets/dataset1.h5ad"],
"description": [None, None],
"cell_type_by_expert": [np.nan, {"T cell", "B cell"}],
"cell_type_by_model": [{"T cell", "B cell"}, {"T cell", "B cell"}],
"study": [{"Candidate marker study 2"}, {"Candidate marker study 1"}],
"cell_medium": [{"IFNG", "DMSO"}, {"IFNG", "DMSO"}],
"temperature": [{21.6}, np.nan],
"study_note": [
{
"We had a great time performing this study and the results look compelling."
},
np.nan,
],
"date_of_study": [{"2024-12-01"}, np.nan],
}
expected_df = pd.DataFrame(expected_data)
check_df_equality(df, expected_df)

# expected output has italicized elements that can't be tested
# hence testing is restricted to section content, not headings
description_tree = _describe_postgres(artifact, print_types=True)
print(description_tree)

# general section
assert len(description_tree.children) == 3
Expand Down Expand Up @@ -138,6 +231,7 @@ def test_curate_annotate_df():
]

artifact.delete(permanent=True)
artifact2.delete(permanent=True)
ln.FeatureSet.filter().delete()
ln.Feature.filter().delete()
bt.Gene.filter().delete()
Expand Down
1 change: 1 addition & 0 deletions tests/core/test_queryset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from lnschema_core.users import current_user_id


# please also see the test_curate_df.py tests
def test_df():
project_label = ln.ULabel(name="project").save()
project_names = [f"Project {i}" for i in range(3)]
Expand Down