laminlabs · sunnyosun · Aug 6, 2024 · Aug 5, 2024 · Aug 5, 2024 · Aug 5, 2024
diff --git a/docs/curate-df.ipynb b/docs/curate-df.ipynb
@@ -75,12 +75,15 @@
    },
    "outputs": [],
    "source": [
-    "df = pd.DataFrame({\n",
-    "    \"temperature\": [37.2, 36.3, 38.2],\n",
-    "    \"cell_type\": [\"cerebral pyramidal neuron\", \"astrocyte\", \"oligodendrocyte\"],\n",
-    "    \"assay_ontology_id\": [\"EFO:0008913\", \"EFO:0008913\", \"EFO:0008913\"],\n",
-    "    \"donor\": [\"D0001\", \"D0002\", \"DOOO3\"],\n",
-    "})\n",
+    "df = pd.DataFrame(\n",
+    "    {\n",
+    "        \"temperature\": [37.2, 36.3, 38.2],\n",
+    "        \"cell_type\": [\"cerebral pyramidal neuron\", \"astrocyte\", \"oligodendrocyte\"],\n",
+    "        \"assay_ontology_id\": [\"EFO:0008913\", \"EFO:0008913\", \"EFO:0008913\"],\n",
+    "        \"donor\": [\"D0001\", \"D0002\", \"DOOO3\"]\n",
+    "    },\n",
+    "    index = [\"obs1\", \"obs2\", \"obs3\"]\n",
+    ")\n",
     "df"
    ]
   },
@@ -109,8 +112,24 @@
     "    \"cell_type\": bt.CellType.name,\n",
     "    \"assay_ontology_id\": bt.ExperimentalFactor.ontology_id,\n",
     "    \"donor\": ln.ULabel.name,\n",
-    "}\n",
-    "\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6c293ddf",
+   "metadata": {},
+   "source": [
+    "Now, initialize a Curate object by passing data and its validation criteria:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "78424b30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# create an Curate object to guide validation and annotation\n",
     "# this object will use our DataFrame and the defined categorical criteria\n",
     "curate = ln.Curate.from_df(df, categoricals=categoricals)"
@@ -150,17 +169,6 @@
     "Because our current database instance is empty, here, we'll add values to the registries defined in the validation criteria."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "65bb5e39",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# this adds assays that were validated (via a public ontology)\n",
-    "curate.add_validated_from(\"assay_ontology_id\")"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -172,8 +180,8 @@
    },
    "outputs": [],
    "source": [
-    "# this adds cell types that were _not_ validated\n",
-    "curate.add_new_from(\"cell_type\")"
+    "# this adds cell types that were validated (via a public ontology)\n",
+    "curate.add_validated_from(\"cell_type\")"
    ]
   },
   {
@@ -220,10 +228,25 @@
    "source": [
     "# curate the cell type\n",
     "df.cell_type = df.cell_type.replace({\"cerebral pyramidal neuron\": cell_types.cerebral_cortex_pyramidal_neuron.name})\n",
-    "# register validated cell types\n",
+    "# now register curated and validated cell types\n",
     "curate.add_validated_from(df.cell_type.name)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ede057e8",
+   "metadata": {
+    "tags": [
+     "hide-output"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# this adds assays that were validated (via a public ontology)\n",
+    "curate.add_validated_from(\"assay_ontology_id\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -235,7 +258,7 @@
    },
    "outputs": [],
    "source": [
-    "# register non-validated donors\n",
+    "# this adds donors that were _not_ validated\n",
     "curate.add_new_from(df.donor.name)"
    ]
   },
@@ -262,7 +285,7 @@
    "source": [
     "## Validate an AnnData object\n",
     "\n",
-    "Here we addtionally specify which `var_fields` to validate against."
+    "Here we additionally specify which `var_index` to validate against."
    ]
   },
   {
@@ -276,9 +299,17 @@
    },
    "outputs": [],
    "source": [
-    "df.index = [\"obs1\", \"obs2\", \"obs3\"]\n",
-    "\n",
-    "X = pd.DataFrame({\"TCF7\": [1, 2, 3], \"PDCD1\": [4, 5, 6], \"CD3E\": [7, 8, 9], \"CD4\": [10, 11, 12], \"CD8A\": [13, 14, 15]}, index=[\"obs1\", \"obs2\", \"obs3\"])\n",
+    "X = pd.DataFrame(\n",
+    "    {\n",
+    "        \"ENSG00000081059\": [1, 2, 3], \n",
+    "        \"ENSG00000276977\": [4, 5, 6], \n",
+    "        \"ENSG00000198851\": [7, 8, 9], \n",
+    "        \"ENSG00000010610\": [10, 11, 12], \n",
+    "        \"ENSG00000153563\": [13, 14, 15],\n",
+    "        \"corrupted\": [16, 17, 18]\n",
+    "    }, \n",
+    "    index=df.index\n",
+    ")\n",
     "\n",
     "adata = ad.AnnData(X=X, obs=df)\n",
     "adata"
@@ -297,7 +328,7 @@
    "source": [
     "curate = ln.Curate.from_anndata(\n",
     "    adata, \n",
-    "    var_index=bt.Gene.symbol,\n",
+    "    var_index=bt.Gene.ensembl_gene_id,  # validate var.index against Gene.ensembl_gene_id\n",
     "    categoricals=categoricals, \n",
     "    organism=\"human\",\n",
     ")"
@@ -317,31 +348,76 @@
     "curate.validate()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "a8a7a653",
+   "metadata": {},
+   "source": [
+    "## Curate data object to pass validation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "614545ea",
+   "metadata": {},
+   "source": [
+    "Non-validated terms can be accessed via:"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "7877c058",
-   "metadata": {
-    "tags": [
-     "hide-output"
-    ]
-   },
+   "id": "4e2fd290",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "curate.add_validated_from(\"all\")"
+    "curate.non_validated"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b2ec4aeb",
+   "metadata": {},
+   "source": [
+    "Subset the AnnData object to validated genes only:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c47e4b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "adata_validated = adata[:, ~adata.var.index.isin(curate.non_validated[\"var_index\"])].copy()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1b0b2b10",
+   "metadata": {},
+   "source": [
+    "Now let's validate the subsetted object:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "0e139bda",
+   "id": "6ac64695",
    "metadata": {
     "tags": [
      "hide-output"
     ]
    },
    "outputs": [],
    "source": [
+    "curate = ln.Curate.from_anndata(\n",
+    "    adata_validated, \n",
+    "    var_index=bt.Gene.ensembl_gene_id,  # validate var.index against Gene.ensembl_gene_id\n",
+    "    categoricals=categoricals, \n",
+    "    organism=\"human\",\n",
+    ")\n",
+    "\n",
     "curate.validate()"
    ]
   },
@@ -431,7 +507,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.1.-1"
+   "version": "3.10.13"
   },
   "nbproject": {
    "id": "WOK3vP0bNGLx",

diff --git a/docs/introduction.ipynb b/docs/introduction.ipynb
@@ -804,6 +804,7 @@
     "    categoricals={adata.obs.perturbation.name: ln.ULabel.name}, \n",
     "    organism=\"human\",  # specify the organism for the Gene registry\n",
     ")\n",
+    "curate.add_validated_from_var_index()\n",
     "curate.validate()\n",
     "\n",
     "# save curated artifact\n",