Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✅ More tests for Artifact.df() #2241

Merged
merged 3 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions docs/registries.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -620,9 +620,49 @@
},
"outputs": [],
"source": [
"ln.Artifact.filter().order_by(\"created_at\").df()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec39c7ff",
"metadata": {
"tags": [
"hide-output"
]
},
"outputs": [],
"source": [
"# reverse ordering\n",
"ln.Artifact.filter().order_by(\"-created_at\").df()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9830119",
"metadata": {
"tags": [
"hide-output"
]
},
"outputs": [],
"source": [
"ln.Artifact.filter().order_by(\"key\").df()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a705b47",
"metadata": {},
"outputs": [],
"source": [
"# reverse ordering\n",
"ln.Artifact.filter().order_by(\"-key\").df()"
]
},
{
"attachments": {},
"cell_type": "markdown",
Expand Down
File renamed without changes.
120 changes: 107 additions & 13 deletions tests/core/test_curate_annotate_df.py → tests/core/test_describe_df.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,53 @@
import anndata as ad
import bionty as bt
import lamindb as ln
import numpy as np
import pandas as pd
from lamindb.core import datasets
from lamindb.core._data import _describe_postgres
from lamindb.core.datasets import small_dataset1


def test_curate_annotate_df():
## Define the schema of the dataset & its metadata
def check_df_equality(actual_df: pd.DataFrame, expected_df: pd.DataFrame):
    """Check equality between two DataFrames.

    Compares columns pairwise with special handling for columns whose
    values are sets (order-insensitive set comparison) and for NaN values
    (two NaNs compare equal). Row indices are intentionally NOT compared;
    the expected frame is aligned to the actual frame's index first.

    Args:
        actual_df: The DataFrame produced by the code under test.
        expected_df: The DataFrame holding the expected values.

    Returns:
        True if the frames are equal under the rules above.

    Raises:
        AssertionError: On any column-set, NaN-placement, or value mismatch.
    """
    assert len(actual_df) == len(expected_df), "row counts differ"
    # do not test indices by default
    # pd.testing.assert_index_equal(actual_df.index, expected_df.index)
    # align on the actual index; operate on a copy so the caller's
    # DataFrame is not mutated as a side effect
    expected_df = expected_df.copy()
    expected_df.index = actual_df.index
    assert set(actual_df.columns) == set(expected_df.columns)
    for col in expected_df.columns:
        # Detect if column contains sets by checking first non-null value
        first_value = next((v for v in expected_df[col] if pd.notna(v)), None)
        if isinstance(first_value, set):
            # For set columns, compare element-wise with NaN handling
            for idx in expected_df.index:
                actual_val = actual_df.loc[idx, col]
                expected_val = expected_df.loc[idx, col]
                # If both are NaN, they're equal
                if pd.isna(actual_val) and pd.isna(expected_val):
                    continue
                # If one is NaN and the other isn't, they're not equal
                if pd.isna(actual_val) != pd.isna(expected_val):
                    raise AssertionError(f"NaN mismatch at index {idx} in column {col}")
                # If neither is NaN, compare the sets
                assert (
                    actual_val == expected_val
                ), f"Set mismatch at index {idx} in column {col}"
        else:
            pd.testing.assert_series_equal(
                actual_df[col],
                expected_df[col],
                check_names=False,  # ignore series names
            )
    return True


# parallels the `registries` guide
# please also see the test_queryset.py tests
def test_curate_df():
## Create a more complex case
# observation-level metadata
ln.Feature(name="cell_medium", dtype="cat[ULabel]").save()
ln.Feature(name="sample_note", dtype="str").save()
Expand All @@ -19,20 +58,17 @@ def test_curate_annotate_df():
ln.Feature(name="study", dtype="cat[ULabel]").save()
ln.Feature(name="date_of_study", dtype="date").save()
ln.Feature(name="study_note", dtype="str").save()

## Register permissible values for categoricals
## Permissible values for categoricals
ln.ULabel.from_values(["DMSO", "IFNG"], create=True).save()
ln.ULabel.from_values(
["Candidate marker study 1", "Candidate marker study 2"], create=True
).save()
bt.CellType.from_values(["B cell", "T cell"], create=True).save()

## Ingest a dataset
dataset_ad = small_dataset1(format="anndata")

# curate dataset
## Ingest dataset1
adata = datasets.small_dataset1(format="anndata")
curator = ln.Curator.from_anndata(
dataset_ad,
adata,
var_index=bt.Gene.symbol,
categoricals={
"cell_medium": ln.ULabel.name,
Expand All @@ -42,13 +78,70 @@ def test_curate_annotate_df():
organism="human",
)
artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
# annotate with dataset-level features
artifact.features.add_values(dataset_ad.uns)
artifact.features.add_values(adata.uns)

# Ingest dataset2
adata2 = datasets.small_dataset2(format="anndata")
curator = ln.Curator.from_anndata(
adata2,
var_index=bt.Gene.symbol,
categoricals={
"cell_medium": ln.ULabel.name,
"cell_type_by_model": bt.CellType.name,
},
organism="human",
)
artifact2 = curator.save_artifact(key="example_datasets/dataset2.h5ad")
artifact2.features.add_values(adata2.uns)

# Test df(include=[...])
df = (
ln.Artifact.filter(key__startswith="example_datasets/dataset", suffix=".h5ad")
.order_by("-key")
.df(include=["feature_sets__hash", "feature_sets__name"])
.drop(["uid"], axis=1)
)
expected_data = {
"key": ["example_datasets/dataset2.h5ad", "example_datasets/dataset1.h5ad"],
"description": [None, None],
"feature_sets__hash": [
set(artifact2.feature_sets.all().values_list("hash", flat=True)),
set(artifact.feature_sets.all().values_list("hash", flat=True)),
],
"feature_sets__name": [{None}, {None}],
}
expected_df = pd.DataFrame(expected_data)
check_df_equality(df, expected_df)

# Test df(features=True)
df = (
ln.Artifact.filter(key__startswith="example_datasets/dataset", suffix=".h5ad")
.order_by("-key")
.df(features=True)
.drop(["uid"], axis=1)
)
expected_data = {
"key": ["example_datasets/dataset2.h5ad", "example_datasets/dataset1.h5ad"],
"description": [None, None],
"cell_type_by_expert": [np.nan, {"T cell", "B cell"}],
"cell_type_by_model": [{"T cell", "B cell"}, {"T cell", "B cell"}],
"study": [{"Candidate marker study 2"}, {"Candidate marker study 1"}],
"cell_medium": [{"IFNG", "DMSO"}, {"IFNG", "DMSO"}],
"temperature": [{21.6}, np.nan],
"study_note": [
{
"We had a great time performing this study and the results look compelling."
},
np.nan,
],
"date_of_study": [{"2024-12-01"}, np.nan],
}
expected_df = pd.DataFrame(expected_data)
check_df_equality(df, expected_df)

# expected output has italicized elements that can't be tested
# hence testing is restricted to section content, not headings
description_tree = _describe_postgres(artifact, print_types=True)
print(description_tree)

# general section
assert len(description_tree.children) == 3
Expand Down Expand Up @@ -138,6 +231,7 @@ def test_curate_annotate_df():
]

artifact.delete(permanent=True)
artifact2.delete(permanent=True)
ln.FeatureSet.filter().delete()
ln.Feature.filter().delete()
bt.Gene.filter().delete()
Expand Down
1 change: 1 addition & 0 deletions tests/core/test_queryset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from lnschema_core.users import current_user_id


# please also see the test_curate_df.py tests
def test_df():
project_label = ln.ULabel(name="project").save()
project_names = [f"Project {i}" for i in range(3)]
Expand Down