From 52492e743f29ec14d7668aff70aeec28fcc32e89 Mon Sep 17 00:00:00 2001 From: Sunny Sun <38218185+sunnyosun@users.noreply.github.com> Date: Wed, 27 Nov 2024 14:57:10 +0100 Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Add=20`.standardize()`=20t?= =?UTF-8?q?o=20`Curator`=20and=20refactor=20(#2186)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: zethson Co-authored-by: zethson Co-authored-by: Alex Wolf --- docs/curate-df.ipynb | 123 ++++++- lamindb/_curate.py | 707 +++++++++++++++++++++----------------- lamindb/_from_values.py | 6 +- lamindb/core/_django.py | 5 +- tests/core/test_curate.py | 140 +++++++- 5 files changed, 620 insertions(+), 361 deletions(-) diff --git a/docs/curate-df.ipynb b/docs/curate-df.ipynb index f2f960071..02abd4cb4 100644 --- a/docs/curate-df.ipynb +++ b/docs/curate-df.ipynb @@ -7,19 +7,76 @@ "source": [ "# Curate DataFrames and AnnDatas\n", "\n", - "Curating datasets typically means three things:\n", + "Curating a dataset with LaminDB means three things:\n", "\n", - "1. Validate: ensure a dataset meets predefined _validation criteria_\n", - "2. Standardize: transform a dataset so that it meets validation criteria, e.g., by fixing typos or using standardized identifiers\n", - "3. Annotate: link a dataset against metadata records\n", + "1. **Validate:** ensure the dataset meets predefined _validation criteria_\n", + "2. **Standardize:** transform the dataset so that it meets validation criteria, e.g., by fixing typos or using standard instead of ad hoc identifiers\n", + "3. **Annotate:** link the dataset against validated metadata so that it becomes queryable\n", "\n", - "In LaminDB, valid metadata is metadata that's stored in a metadata registry and _validation criteria_ merely defines a mapping onto a field of a registry.\n", + "If a dataset passes validation, curating it takes two lines of code:\n", "\n", - "```{admonition} Example\n", + "```python\n", + "curator = ln.Curator.from_df(df, ...) # create a Curator and pass criteria in \"...\"\n", + "curator.save_artifact() # validates the content of the dataset and saves it as annotated artifact\n", + "```\n", "\n", - "`\"Experiment 1\"` is a valid value for `ULabel.name` if a record with this name exists in the {class}`~lamindb.ULabel` registry.\n", + "Beyond having valid content, the curated dataset is now queryable via metadata identifiers found in the dataset because they have been validated & linked against LaminDB registries.\n", "\n", - "```" + ":::{admonition} Definition: valid metadata identifier\n", + "\n", + "An identifier like `\"Experiment 1\"` is a valid value for `ULabel.name` if a record with `name` `\"Experiment 1\"` exists in the {class}`~lamindb.ULabel` registry.\n", + "\n", + "```python\n", + "categoricals = {\"experiment\": ln.ULabel.name} # the validation constraint\n", + "curator = ln.Curator.from_df(df, categoricals=categoricals)\n", + "curator.validate()\n", + "```\n", + "\n", + "The DataFrame validates if \n", + "\n", + "- there is a column with name `\"experiment\"` in the dataframe whose values are all found in the `name` field of the {class}`~lamindb.ULabel` registry\n", + "- the column name `\"experiment\"` is found in the `name` field of the {class}`~lamindb.Feature` registry\n", + "\n", + ":::\n", + "\n", + "Beyond validating metadata identifiers, LaminDB also validates data types and dataset schema.\n", + "\n", + ":::{dropdown} How does validation in LaminDB compare to validation in pandera?\n", + "\n", + "Like LaminDB, [pandera](https://pandera.readthedocs.io/) validates the _dataset schema_ (i.e., column names and `dtype`s).\n", + "\n", + "`pandera` is only available for `DataFrame`-like datasets and cannot annotate datasets; i.e., can't make datasets queryable.\n", + "\n", + "However, it offers an API for range-checks, both for numerical and string-like data. If you need such checks, you can combine LaminDB and pandera-based validation.\n", + "\n", + "```python\n", + "import pandas as pd\n", + "import pandera as pa\n", + "\n", + "# data to validate\n", + "df = pd.DataFrame({\n", + " \"column1\": [1, 4, 0, 10, 9],\n", + " \"column2\": [-1.3, -1.4, -2.9, -10.1, -20.4],\n", + " \"column3\": [\"value_1\", \"value_2\", \"value_3\", \"value_2\", \"value_1\"],\n", + "})\n", + "\n", + "# define schema\n", + "schema = pa.DataFrameSchema({\n", + " \"column1\": pa.Column(int, checks=pa.Check.le(10)),\n", + " \"column2\": pa.Column(float, checks=pa.Check.lt(-1.2)),\n", + " \"column3\": pa.Column(str, checks=[\n", + " pa.Check.str_startswith(\"value_\"),\n", + " # define custom checks as functions that take a series as input and\n", + " # outputs a boolean or boolean Series\n", + " pa.Check(lambda s: s.str.split(\"_\", expand=True).shape[1] == 2)\n", + " ]),\n", + "})\n", + "\n", + "validated_df = schema(df) # this corresponds to curator.validate() in LaminDB\n", + "print(validated_df)\n", + "```\n", + "\n", + ":::" ] }, { @@ -42,7 +99,7 @@ "id": "946a3371", "metadata": {}, "source": [ - "## Validate a DataFrame" + "## Curate a DataFrame" ] }, { @@ -72,7 +129,7 @@ "df = pd.DataFrame(\n", " {\n", " \"temperature\": [37.2, 36.3, 38.2],\n", - " \"cell_type\": [\"cerebral pyramidal neuron\", \"astrocyte\", \"oligodendrocyte\"],\n", + " \"cell_type\": [\"cerebral pyramidal neuron\", \"astrocytic glia\", \"oligodendrocyte\"],\n", " \"assay_ontology_id\": [\"EFO:0008913\", \"EFO:0008913\", \"EFO:0008913\"],\n", " \"donor\": [\"D0001\", \"D0002\", \"D0003\"]\n", " },\n", @@ -134,22 +191,54 @@ "curate.validate()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7acf0d2", + "metadata": {}, + "outputs": [], + "source": [ + "# check the non-validated terms\n", + "curate.non_validated" + ] + }, { "cell_type": "markdown", - "id": "7c157df6", + "id": "8c2417c7", "metadata": {}, "source": [ - "## Register new metadata values\n", + "For `cell_type`, we saw that \"cerebral pyramidal neuron\", \"astrocytic glia\" are not validated.\n", "\n", - "If you see \"non-validated\" values, you'll need to decide whether to add them to your registries or \"fix\" them in your dataset." + "First, let's standardize synonym \"astrocytic glia\" as suggested" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35b3ce8e", + "metadata": {}, + "outputs": [], + "source": [ + "curate.standardize(\"cell_type\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "336293ac", + "metadata": {}, + "outputs": [], + "source": [ + "# now we have only one non-validated term left\n", + "curate.non_validated" ] }, { "cell_type": "markdown", - "id": "8c2417c7", + "id": "c1bfe41c", "metadata": {}, "source": [ - "For `cell_type`, we saw that 'cerebral pyramidal neuron' is not validated, let's understand which cell type in the public ontology might be the actual match." + "For \"cerebral pyramidal neuron\", let's understand which cell type in the public ontology might be the actual match." ] }, { @@ -244,7 +333,7 @@ "id": "b9d09a10", "metadata": {}, "source": [ - "## Validate an AnnData\n", + "## Curate an AnnData\n", "\n", "Here we additionally specify which `var_index` to validate against." ] @@ -466,7 +555,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.15" }, "nbproject": { "id": "WOK3vP0bNGLx", diff --git a/lamindb/_curate.py b/lamindb/_curate.py index 528c810ee..318604a2c 100644 --- a/lamindb/_curate.py +++ b/lamindb/_curate.py @@ -1,6 +1,7 @@ from __future__ import annotations import copy +import warnings from typing import TYPE_CHECKING import anndata as ad @@ -16,6 +17,7 @@ ULabel, ) +from ._from_values import _print_values from .core.exceptions import ValidationError if TYPE_CHECKING: @@ -28,7 +30,21 @@ class CurateLookup: - """Lookup categories from the reference instance.""" + """Lookup categories from the reference instance. + + Args: + categoricals: A dictionary of categorical fields to lookup. + slots: A dictionary of slot fields to lookup. + using_key: The key of the instance to lookup from. Defaults to the + current instance if not specified. + public: Whether to lookup from the public instance. Defaults to False. + + Example: + >>> validator = ln.Validator() + >>> validator.lookup()["cell_type"].alveolar_type_1_fibroblast_cell + + + """ def __init__( self, @@ -37,8 +53,7 @@ def __init__( using_key: str | None = None, public: bool = False, ) -> None: - if slots is None: - slots = {} + slots = slots or {} self._fields = {**categoricals, **slots} self._using_key = None if using_key == "default" else using_key self._using_key_name = self._using_key or ln_setup.settings.instance.slug @@ -54,7 +69,7 @@ def __getattr__(self, name): else: return get_registry_instance(registry, self._using_key).lookup() raise AttributeError( - f"'{self.__class__.__name__}' object has no attribute '{name}'" + f'"{self.__class__.__name__}" object has no attribute "{name}"' ) def __getitem__(self, name): @@ -65,7 +80,7 @@ def __getitem__(self, name): else: return get_registry_instance(registry, self._using_key).lookup() raise AttributeError( - f"'{self.__class__.__name__}' object has no attribute '{name}'" + f'"{self.__class__.__name__}" object has no attribute "{name}"' ) def __repr__(self) -> str: @@ -81,7 +96,7 @@ def __repr__(self) -> str: f"Lookup objects from the {colors.italic(ref)}:\n " f"{colors.green(getattr_keys)}\n " f"{colors.green(getitem_keys)}\n" - "Example:\n → categories = validator.lookup()['cell_type']\n" + 'Example:\n → categories = validator.lookup()["cell_type"]\n' " → categories.alveolar_type_1_fibroblast_cell\n\n" "To look up public ontologies, use .lookup(public=True)" ) @@ -95,10 +110,25 @@ class BaseCurator: def validate(self) -> bool: """Validate dataset. + This method also registers the validated records in the current instance. + Returns: Boolean indicating whether the dataset is validated. """ - pass + pass # pragma: no cover + + def standardize(self, key: str) -> None: + """Replace synonyms with standardized values. + + Inplace modification of the dataset. + + Args: + key: `str` The name of the column to standardize. + + Returns: + None + """ + pass # pragma: no cover def save_artifact( self, @@ -118,7 +148,7 @@ def save_artifact( Returns: A saved artifact record. """ - pass + pass # pragma: no cover class DataFrameCurator(BaseCurator): @@ -127,14 +157,17 @@ class DataFrameCurator(BaseCurator): See also :class:`~lamindb.Curator`. Args: - df: The DataFrame object to curate. - columns: The field attribute for the feature column. - categoricals: A dictionary mapping column names to registry_field. - using_key: The reference instance containing registries to validate against. - verbosity: The verbosity level. - organism: The organism name. - sources: A dictionary mapping column names to Source records. - exclude: A dictionary mapping column names to values to exclude. + df: `pd.DataFrame` The DataFrame object to curate. + columns: `FieldAttr=Feature.name` The field attribute for the feature column. + categoricals: `dict[str, FieldAttr] | None = None` A dictionary mapping column names to registry_field. + using_key: `str | None = None` The reference instance containing registries to validate against. + verbosity: `str = "hint"` The verbosity level. + organism: `str | None = None` The organism name. + sources: `dict[str, Record] | None = None` A dictionary mapping column names to Source records. + exclude: `dict | None = None` A dictionary mapping column names to values to exclude. + + Returns: + A curator object. Examples: >>> import bionty as bt @@ -165,24 +198,21 @@ def __init__( self._fields = categoricals or {} self._columns_field = columns self._using_key = using_key + # TODO: change verbosity back settings.verbosity = verbosity self._artifact = None self._collection = None self._validated = False self._kwargs = {"organism": organism} if organism else {} - if sources is None: - sources = {} - self._sources = sources - if exclude is None: - exclude = {} - self._exclude = exclude + self._sources = sources or {} + self._exclude = exclude or {} self._non_validated = None if check_valid_keys: self._check_valid_keys() self._save_columns() @property - def non_validated(self) -> list: + def non_validated(self) -> dict[str, list[str]]: """Return the non-validated features and labels.""" if self._non_validated is None: raise ValidationError("Please run validate() first!") @@ -200,7 +230,6 @@ def lookup( Args: using_key: The instance where the lookup is performed. - if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator. if "public", the lookup is performed on the public reference. """ return CurateLookup( @@ -210,9 +239,8 @@ def lookup( public=public, ) - def _check_valid_keys(self, extra: set = None) -> None: - if extra is None: - extra = set() + def _check_valid_keys(self, extra: set | None = None) -> None: + extra = extra or set() for name, d in { "categoricals": self._fields, "sources": self._sources, @@ -222,9 +250,12 @@ def _check_valid_keys(self, extra: set = None) -> None: raise TypeError(f"{name} must be a dictionary!") valid_keys = set(self._df.columns) | {"columns"} | extra nonval_keys = [key for key in d.keys() if key not in valid_keys] + n = len(nonval_keys) + s = "s" if n > 1 else "" + are = "are" if n > 1 else "is" if len(nonval_keys) > 0: raise ValidationError( - f"the following keys passed to {name} are not allowed: {nonval_keys}" + f"the following {n} key{s} passed to {name} {are} not allowed: {colors.yellow(_print_values(nonval_keys))}" ) def _save_columns(self, validated_only: bool = True) -> None: @@ -234,7 +265,6 @@ def _save_columns(self, validated_only: bool = True) -> None: values=list(self.fields.keys()), field=self._columns_field, key="columns", - save_function="add_new_from_columns", using_key=self._using_key, validated_only=False, source=self._sources.get("columns"), @@ -249,13 +279,11 @@ def _save_columns(self, validated_only: bool = True) -> None: values=list(additional_columns), field=self._columns_field, key="columns", - save_function="add_new_from_columns", using_key=self._using_key, validated_only=validated_only, df=self._df, # Get the Feature type from df source=self._sources.get("columns"), exclude=self._exclude.get("columns"), - warning=False, # Do not warn about missing columns, just an info message **self._kwargs, # type: ignore ) @@ -265,7 +293,7 @@ def add_new_from(self, key: str, organism: str | None = None, **kwargs): Args: key: The key referencing the slot in the DataFrame from which to draw terms. organism: The organism name. - **kwargs: Additional keyword arguments to pass to the registry model. + **kwargs: Additional keyword arguments to pass to create new records """ if len(kwargs) > 0 and key == "all": raise ValueError("Cannot pass additional arguments to 'all' key!") @@ -273,20 +301,83 @@ def add_new_from(self, key: str, organism: str | None = None, **kwargs): self._update_registry(key, validated_only=False, **self._kwargs, **kwargs) def add_new_from_columns(self, organism: str | None = None, **kwargs): - """Add validated & new column names to its registry. + """Deprecated to run by default during init.""" + warnings.warn( + "`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.", + DeprecationWarning, + stacklevel=2, + ) + pass + + def _replace_synonyms( + self, key: str, syn_mapper: dict, values: pd.Series | pd.Index + ): + # replace the values in df + std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val)) + # remove the standardized values from self.non_validated + non_validated = [i for i in self.non_validated[key] if i not in syn_mapper] + if len(non_validated) == 0: + self._non_validated.pop(key, None) # type: ignore + else: + self._non_validated[key] = non_validated # type: ignore + # logging + n = len(syn_mapper) + if n > 0: + syn_mapper_print = _print_values( + [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep="" + ) + s = "s" if n > 1 else "" + logger.success( + f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}' + ) + return std_values + + def standardize(self, key: str): + """Replace synonyms with standardized values. Args: - organism: The organism name. - **kwargs: Additional keyword arguments to pass to the registry model. + key: `str` The key referencing the slot in the DataFrame from which to draw terms. + + Modifies the input dataset inplace. """ - self._kwargs.update({"organism": organism} if organism else {}) - self._save_columns(validated_only=False, **self._kwargs, **kwargs) + # list is needed to avoid RuntimeError: dictionary changed size during iteration + avail_keys = list(self.non_validated.keys()) + if len(avail_keys) == 0: + logger.warning("values are already standardized") + return + + if key == "all": + for k in avail_keys: + if k in self._fields: # needed to exclude var_index + syn_mapper = standardize_categories( + self.non_validated[k], + field=self._fields[k], + using_key=self._using_key, + source=self._sources.get(k), + **self._kwargs, + ) + self._df[k] = self._replace_synonyms(k, syn_mapper, self._df[k]) + else: + if key not in avail_keys: + raise KeyError( + f'"{key}" is not a valid key, available keys are: {_print_values(avail_keys)}!' + ) + else: + if key in self._fields: # needed to exclude var_index + syn_mapper = standardize_categories( + self.non_validated[key], + field=self._fields[key], + using_key=self._using_key, + source=self._sources.get(key), + **self._kwargs, + ) + self._df[key] = self._replace_synonyms( + key, syn_mapper, self._df[key] + ) def _update_registry(self, categorical: str, validated_only: bool = True, **kwargs): if categorical == "all": self._update_registry_all(validated_only=validated_only, **kwargs) - elif categorical == "columns": - self._save_columns(validated_only=validated_only, **kwargs) else: if categorical not in self.fields: raise ValidationError( @@ -302,6 +393,9 @@ def _update_registry(self, categorical: str, validated_only: bool = True, **kwar exclude=self._exclude.get(categorical), **kwargs, ) + # adding new records removes them from non_validated + if not validated_only and self._non_validated: + self._non_validated.pop(categorical, None) # type: ignore def _update_registry_all(self, validated_only: bool = True, **kwargs): """Save labels for all features.""" @@ -311,6 +405,10 @@ def _update_registry_all(self, validated_only: bool = True, **kwargs): def validate(self, organism: str | None = None) -> bool: """Validate variables and categorical observations. + This method also registers the validated records in the current instance: + - from public sources + - from the using_key instance + Args: organism: The organism name. @@ -361,10 +459,6 @@ def save_artifact( verbosity = settings.verbosity try: settings.verbosity = "warning" - if not self._validated: - # save all validated records to the current instance - self._update_registry_all() - self._artifact = save_artifact( self._df, description=description, @@ -400,14 +494,15 @@ class AnnDataCurator(DataFrameCurator): See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version. Args: - data: The AnnData object or an AnnData-like path. - var_index: The registry field for mapping the ``.var`` index. - categoricals: A dictionary mapping ``.obs.columns`` to a registry field. - using_key: A reference LaminDB instance. - verbosity: The verbosity level. - organism: The organism name. - sources: A dictionary mapping ``.obs.columns`` to Source records. - exclude: A dictionary mapping column names to values to exclude. + data: `ad.AnnData | UPathStr` The AnnData object or an AnnData-like path. + var_index: `FieldAttr` The registry field for mapping the ``.var`` index. + categoricals: `dict[str, FieldAttr] | None = None` A dictionary mapping ``.obs.columns`` to a registry field. + obs_columns: `FieldAttr` The registry field for mapping the ``.obs.columns``. + using_key: `str | None = None` A reference LaminDB instance. + verbosity: `str = "hint"` The verbosity level. + organism: `str | None = None` The organism name. + sources: `dict[str, Record] | None = None` A dictionary mapping ``.obs.columns`` to Source records. + exclude: `dict | None = None` A dictionary mapping column names to values to exclude. Examples: >>> import bionty as bt @@ -428,7 +523,7 @@ def __init__( var_index: FieldAttr, categoricals: dict[str, FieldAttr] | None = None, obs_columns: FieldAttr = Feature.name, - using_key: str = "default", + using_key: str | None = None, verbosity: str = "hint", organism: str | None = None, sources: dict[str, Record] | None = None, @@ -492,7 +587,6 @@ def lookup( Args: using_key: The instance where the lookup is performed. - if None (default), the lookup is performed on the instance specified in "using" parameter of the validator. if "public", the lookup is performed on the public reference. """ return CurateLookup( @@ -510,7 +604,6 @@ def _save_from_var_index( values=list(self._adata.var.index), field=self.var_index, key="var_index", - save_function=".add_new_from_var_index()", using_key=self._using_key, validated_only=validated_only, organism=organism, @@ -529,7 +622,7 @@ def add_new_from_var_index(self, organism: str | None = None, **kwargs): Args: organism: The organism name. - **kwargs: Additional keyword arguments to pass to the registry model. + **kwargs: Additional keyword arguments to pass to create new records. """ self._kwargs.update({"organism": organism} if organism else {}) self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs) @@ -537,6 +630,8 @@ def add_new_from_var_index(self, organism: str | None = None, **kwargs): def validate(self, organism: str | None = None) -> bool: """Validate categories. + This method also registers the validated records in the current instance. + Args: organism: The organism name. @@ -558,7 +653,7 @@ def validate(self, organism: str | None = None) -> bool: key="var_index", using_key=self._using_key, source=self._sources.get("var_index"), - validated_hint_print=".add_validated_from_var_index()", + hint_print=".add_new_from_var_index()", exclude=self._exclude.get("var_index"), **self._kwargs, # type: ignore ) @@ -576,6 +671,33 @@ def validate(self, organism: str | None = None) -> bool: self._validated = validated_var and validated_obs return self._validated + def standardize(self, key: str): + """Replace synonyms with standardized values. + + Args: + key: `str` The key referencing the slot in `adata.obs` from which to draw terms. Same as the key in `categoricals`. + - If "var_index", standardize the var.index. + - If "all", standardize all obs columns and var.index. + + Inplace modification of the dataset. + """ + if key in self._adata.obs.columns or key == "all": + # standardize obs columns + super().standardize(key) + # in addition to the obs columns, standardize the var.index + if key == "var_index" or key == "all": + syn_mapper = standardize_categories( + self._adata.var.index, + field=self.var_index, + using_key=self._using_key, + source=self._sources.get("var_index"), + **self._kwargs, + ) + if "var_index" in self._non_validated: # type: ignore + self._adata.var.index = self._replace_synonyms( + "var_index", syn_mapper, self._adata.var.index + ) + def save_artifact( self, description: str | None = None, @@ -603,9 +725,6 @@ def save_artifact( verbosity = settings.verbosity try: settings.verbosity = "warning" - if not self._validated: - # save all validated records to the current instance - self._update_registry_all() self._artifact = save_artifact( self._data, adata=self._adata, @@ -631,17 +750,17 @@ class MuDataCurator: the object should be recreated using :meth:`~lamindb.Curator.from_mudata`. Args: - mdata: The MuData object to curate. - var_index: The registry field for mapping the ``.var`` index for each modality. + mdata: `MuData` The MuData object to curate. + var_index: `dict[str, dict[str, FieldAttr]]` The registry field for mapping the ``.var`` index for each modality. For example: ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}`` - categoricals: A dictionary mapping ``.obs.columns`` to a registry field. + categoricals: `dict[str, FieldAttr] | None = None` A dictionary mapping ``.obs.columns`` to a registry field. Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`. - using_key: A reference LaminDB instance. - verbosity: The verbosity level. - organism: The organism name. - sources: A dictionary mapping ``.obs.columns`` to Source records. - exclude: A dictionary mapping column names to values to exclude. + using_key: `str | None = None` A reference LaminDB instance. + verbosity: `str = "hint"` The verbosity level. + organism: `str | None = None` The organism name. + sources: `dict[str, Record] | None = None` A dictionary mapping ``.obs.columns`` to Source records. + exclude: `dict | None = None` A dictionary mapping column names to values to exclude. Examples: >>> import bionty as bt @@ -664,11 +783,11 @@ def __init__( mdata: MuData, var_index: dict[str, dict[str, FieldAttr]], categoricals: dict[str, FieldAttr] | None = None, - using_key: str = "default", + using_key: str | None = None, verbosity: str = "hint", organism: str | None = None, sources: dict[str, Record] | None = None, - exclude: dict | None = None, + exclude: dict | None = None, # {modality: {field: [values]}} ) -> None: if sources is None: sources = {} @@ -684,19 +803,34 @@ def __init__( self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys()) self._using_key = using_key self._verbosity = verbosity - self._df_annotators = { - modality: DataFrameCurator( - df=mdata[modality].obs if modality != "obs" else mdata.obs, - categoricals=self._obs_fields.get(modality, {}), + self._obs_df_curator = None + if "obs" in self._modalities: + self._obs_df_curator = DataFrameCurator( + df=mdata.obs, + columns=Feature.name, + categoricals=self._obs_fields.get("obs", {}), + using_key=using_key, + verbosity=verbosity, + sources=self._sources.get("obs"), + exclude=self._exclude.get("obs"), + check_valid_keys=False, + **self._kwargs, + ) + self._mod_adata_curators = { + modality: AnnDataCurator( + data=mdata[modality], + var_index=var_index.get(modality), + categoricals=self._obs_fields.get(modality), using_key=using_key, verbosity=verbosity, sources=self._sources.get(modality), exclude=self._exclude.get(modality), - check_valid_keys=False, **self._kwargs, ) for modality in self._modalities + if modality != "obs" } + self._non_validated = None @property def var_index(self) -> FieldAttr: @@ -708,29 +842,19 @@ def categoricals(self) -> dict: """Return the obs fields to validate against.""" return self._obs_fields + @property + def non_validated(self) -> dict[str, dict[str, list[str]]]: + """Return the non-validated features and labels.""" + if self._non_validated is None: + raise ValidationError("Please run validate() first!") + return self._non_validated + def _verify_modality(self, modalities: Iterable[str]): """Verify the modality exists.""" for modality in modalities: if modality not in self._mdata.mod.keys(): raise ValidationError(f"modality '{modality}' does not exist!") - def _save_from_var_index_modality( - self, modality: str, validated_only: bool = True, **kwargs - ): - """Save variable records.""" - update_registry( - values=list(self._mdata[modality].var.index), - field=self._var_fields[modality], - key="var_index", - save_function=f'.add_new_from_var_index("{modality}")', - using_key=self._using_key, - validated_only=validated_only, - dtype="number", - source=self._sources.get(modality, {}).get("var_index"), - exclude=self._exclude.get(modality, {}).get("var_index"), - **kwargs, - ) - def _parse_categoricals(self, categoricals: dict[str, FieldAttr]) -> dict: """Parse the categorical fields.""" prefixes = {f"{k}:" for k in self._mdata.mod.keys()} @@ -756,13 +880,18 @@ def lookup( Args: using_key: The instance where the lookup is performed. - if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator. if "public", the lookup is performed on the public reference. """ + obs_fields = {} + for mod, fields in self._obs_fields.items(): + for k, v in fields.items(): + if k == "obs": + obs_fields[k] = v + else: + obs_fields[f"{mod}:{k}"] = v return CurateLookup( - categoricals=self._obs_fields, + categoricals=obs_fields, slots={ - **self._obs_fields, **{f"{k}_var_index": v for k, v in self._var_fields.items()}, }, using_key=using_key or self._using_key, @@ -776,27 +905,11 @@ def add_new_from_columns( organism: str | None = None, **kwargs, ): - """Update columns records. - - Args: - modality: The modality name. - column_names: The column names to save. - organism: The organism name. - **kwargs: Additional keyword arguments to pass to the registry model. - """ - self._kwargs.update({"organism": organism} if organism else {}) - values = column_names or self._mdata[modality].obs.columns - update_registry( - values=list(values), - field=Feature.name, - key=f"{modality} obs columns", - using_key=self._using_key, - validated_only=False, - df=self._mdata[modality].obs, - source=self._sources.get(modality, {}).get("columns"), - exclude=self._exclude.get(modality, {}).get("columns"), - **self._kwargs, # type: ignore - **kwargs, + """Update columns records.""" + warnings.warn( + "`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.", + DeprecationWarning, + stacklevel=2, ) def add_new_from_var_index( @@ -807,21 +920,21 @@ def add_new_from_var_index( Args: modality: The modality name. organism: The organism name. - **kwargs: Additional keyword arguments to pass to the registry model. + **kwargs: Additional keyword arguments to pass to create new records. """ self._kwargs.update({"organism": organism} if organism else {}) - self._save_from_var_index_modality( - modality=modality, validated_only=False, **self._kwargs, **kwargs + self._mod_adata_curators[modality].add_new_from_var_index( + **self._kwargs, **kwargs ) def _update_registry_all(self): """Update all registries.""" - for modality in self._var_fields.keys(): - self._save_from_var_index_modality( - modality=modality, validated_only=True, **self._kwargs + if self._obs_df_curator is not None: + self._obs_df_curator._update_registry_all( + validated_only=True, **self._kwargs ) - for _, df_annotator in self._df_annotators.items(): - df_annotator._update_registry_all(validated_only=True, **self._kwargs) + for _, adata_curator in self._mod_adata_curators.items(): + adata_curator._update_registry_all(validated_only=True, **self._kwargs) def add_new_from( self, @@ -836,15 +949,17 @@ def add_new_from( key: The key referencing the slot in the DataFrame. modality: The modality name. organism: The organism name. - **kwargs: Additional keyword arguments to pass to the registry model. + **kwargs: Additional keyword arguments to pass to create new records. """ if len(kwargs) > 0 and key == "all": raise ValueError("Cannot pass additional arguments to 'all' key!") self._kwargs.update({"organism": organism} if organism else {}) modality = modality or "obs" - if modality in self._df_annotators: - df_annotator = self._df_annotators[modality] - df_annotator.add_new_from(key=key, **self._kwargs, **kwargs) + if modality in self._mod_adata_curators: + adata_curator = self._mod_adata_curators[modality] + adata_curator.add_new_from(key=key, **self._kwargs, **kwargs) + if modality == "obs": + self._obs_df_curator.add_new_from(key=key, **self._kwargs, **kwargs) def validate(self, organism: str | None = None) -> bool: """Validate categories.""" @@ -853,7 +968,7 @@ def validate(self, organism: str | None = None) -> bool: self._kwargs.update({"organism": organism} if organism else {}) if self._using_key is not None and self._using_key != "default": logger.important( - f"validating metadata using registries of instance {colors.italic(self._using_key)}" + f"validating using registries of instance {colors.italic(self._using_key)}" ) # add all validated records to the current instance @@ -864,49 +979,42 @@ def validate(self, organism: str | None = None) -> bool: finally: settings.verbosity = verbosity - validated_var = True - non_validated_var_modality = {} - for modality, var_field in self._var_fields.items(): - is_validated_var, non_validated_var = validate_categories( - self._mdata[modality].var.index, - field=var_field, - key=f"{modality}_var_index", - using_key=self._using_key, - source=self._sources.get(modality, {}).get("var_index"), - exclude=self._exclude.get(modality, {}).get("var_index"), - validated_hint_print=f'.add_validated_from_var_index("{modality}")', - **self._kwargs, # type: ignore - ) - validated_var &= is_validated_var - if len(non_validated_var) > 0: - non_validated_var_modality[modality] = non_validated_var + self._non_validated = {} # type: ignore - validated_obs = True - non_validated_obs_modality = {} - for modality, fields in self._obs_fields.items(): - if modality == "obs": - obs = self._mdata.obs - else: - obs = self._mdata[modality].obs - is_validated_obs, non_validated_obs = validate_categories_in_df( - obs, - fields=fields, - using_key=self._using_key, - sources=self._sources.get(modality), - exclude=self._exclude.get(modality), - **self._kwargs, - ) - validated_obs &= is_validated_obs - non_validated_obs_modality[modality] = non_validated_obs - if modality in non_validated_var_modality: - non_validated_obs_modality[modality]["var_index"] = ( - non_validated_var_modality[modality] - ) - if len(non_validated_obs_modality[modality]) > 0: - self._non_validated = non_validated_obs_modality[modality] - self._validated = validated_var and validated_obs + obs_validated = True + if "obs" in self._modalities: + logger.info('validating categoricals in "obs"...') + obs_validated &= self._obs_df_curator.validate(**self._kwargs) + self._non_validated["obs"] = self._obs_df_curator.non_validated # type: ignore + logger.print("") + + mods_validated = True + for modality, adata_curator in self._mod_adata_curators.items(): + logger.info(f'validating categoricals in modality "{modality}"...') + mods_validated &= adata_curator.validate(**self._kwargs) + if len(adata_curator.non_validated) > 0: + self._non_validated[modality] = adata_curator.non_validated # type: ignore + logger.print("") + + self._validated = obs_validated & mods_validated return self._validated + def standardize(self, key: str, modality: str | None = None): + """Replace synonyms with standardized values. + + Args: + key: `str` The key referencing the slot in the `MuData`. + modality: `str | None = None` The modality name. + + Inplace modification of the dataset. + """ + modality = modality or "obs" + if modality in self._mod_adata_curators: + adata_curator = self._mod_adata_curators[modality] + adata_curator.standardize(key=key) + if modality == "obs": + self._obs_df_curator.standardize(key=key) + def save_artifact( self, description: str | None = None, @@ -934,10 +1042,6 @@ def save_artifact( verbosity = settings.verbosity try: settings.verbosity = "warning" - if not self._validated: - # save all validated records to the current instance - self._update_registry_all() - self._artifact = save_artifact( self._mdata, description=description, @@ -1007,7 +1111,7 @@ def from_anndata( var_index: FieldAttr, categoricals: dict[str, FieldAttr] | None = None, obs_columns: FieldAttr = Feature.name, - using_key: str = "default", + using_key: str | None = None, verbosity: str = "hint", organism: str | None = None, sources: dict[str, Record] | None = None, @@ -1031,7 +1135,7 @@ def from_mudata( mdata: MuData, var_index: dict[str, dict[str, FieldAttr]], categoricals: dict[str, FieldAttr] | None = None, - using_key: str = "default", + using_key: str | None = None, verbosity: str = "hint", organism: str | None = None, ) -> MuDataCurator: @@ -1081,15 +1185,14 @@ def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict: return filter_kwargs -def standardize_and_inspect( +def inspect_instance( values: Iterable[str], field: FieldAttr, registry: type[Record], - standardize: bool = False, exclude: str | list | None = None, **kwargs, ): - """Standardize and inspect values using a registry.""" + """Inspect values using a registry.""" # inspect exclude values in the default instance values = list(values) include_validated = [] @@ -1103,16 +1206,6 @@ def standardize_and_inspect( values = [i for i in values if i not in inspect_result_exclude.validated] include_validated = inspect_result_exclude.validated - if standardize: - if hasattr(registry, "standardize") and hasattr( - registry, - "synonyms", # https://github.com/laminlabs/lamindb/issues/1685 - ): - standardized_values = registry.standardize( - values, field=field, mute=True, **kwargs - ) - values = standardized_values - inspect_result = registry.inspect(values, field=field, mute=True, **kwargs) inspect_result._validated += include_validated inspect_result._non_validated = [ @@ -1144,8 +1237,7 @@ def validate_categories( organism: str | None = None, source: Record | None = None, exclude: str | list | None = None, - standardize: bool = True, - validated_hint_print: str | None = None, + hint_print: str | None = None, ) -> tuple[bool, list]: """Validate ontology terms in a pandas series using LaminDB registries. @@ -1158,7 +1250,7 @@ def validate_categories( source: The source record. exclude: Exclude specific values from validation. standardize: Whether to standardize the values. - validated_hint_print: The hint to print for validated values. + hint_print: The hint to print that suggests fixing non-validated values. """ from lamindb._from_values import _print_values from lamindb.core._settings import settings @@ -1167,42 +1259,43 @@ def validate_categories( def _log_mapping_info(): logger.indent = "" - logger.info(f"mapping {colors.italic(key)} on {colors.italic(model_field)}") - logger.indent = " " + logger.info(f'mapping "{key}" on {colors.italic(model_field)}') + logger.indent = " " registry = field.field.model + # {"organism": organism_name/organism_record} kwargs = check_registry_organism(registry, organism) kwargs.update({"source": source} if source else {}) kwargs_current = get_current_filter_kwargs(registry, kwargs) - # inspect the default instance - inspect_result = standardize_and_inspect( + # inspect values from the default instance + inspect_result = inspect_instance( values=values, field=field, registry=registry, - standardize=standardize, exclude=exclude, **kwargs_current, ) non_validated = inspect_result.non_validated + syn_mapper = inspect_result.synonyms_mapper - # inspect the using instance + # inspect the non-validated values from the using_key instance values_validated = [] if using_key is not None and using_key != "default" and non_validated: registry_using = get_registry_instance(registry, using_key) - inspect_result = standardize_and_inspect( + inspect_result = inspect_instance( values=non_validated, field=field, registry=registry_using, - standardize=standardize, exclude=exclude, **kwargs, ) non_validated = inspect_result.non_validated values_validated += inspect_result.validated + syn_mapper.update(inspect_result.synonyms_mapper) - # inspect from public (bionty only) + # inspect the non-validated values from public (bionty only) if hasattr(registry, "public"): verbosity = settings.verbosity try: @@ -1216,39 +1309,35 @@ def _log_mapping_info(): finally: settings.verbosity = verbosity - validated_hint_print = validated_hint_print or f".add_validated_from('{key}')" - n_validated = len(values_validated) - - if n_validated > 0: - _log_mapping_info() - terms_str = f"{', '.join([f'{chr(39)}{v}{chr(39)}' for v in values_validated[:10]])}{', ...' if len(values_validated) > 10 else ''}" - val_numerous = "" if n_validated == 1 else "s" - logger.warning( - f"found {colors.yellow(n_validated)} validated term{val_numerous}: " - f"{colors.yellow(terms_str)}\n" - f"→ save term{val_numerous} via {colors.yellow(validated_hint_print)}" - ) - - non_validated_hint_print = validated_hint_print.replace("_validated_", "_new_") + # logging messages + non_validated_hint_print = hint_print or f'.add_new_from("{key}")' non_validated = [i for i in non_validated if i not in values_validated] n_non_validated = len(non_validated) if n_non_validated == 0: - if n_validated == 0: + if len(values_validated) == 0: + # nothing to validate logger.indent = "" - logger.success(f"'{key}' is validated against {colors.italic(model_field)}") + logger.success(f'"{key}" is validated against {colors.italic(model_field)}') return True, [] else: # validated values still need to be saved to the current instance return False, [] else: - non_val_numerous = ("", "is") if n_non_validated == 1 else ("s", "are") + are = "is" if n_non_validated == 1 else "are" + s = "" if n_non_validated == 1 else "s" print_values = _print_values(non_validated) - warning_message = ( - f"{colors.red(f'{n_non_validated} term{non_val_numerous[0]}')} {non_val_numerous[1]} not validated: " - f"{colors.red(', '.join(print_values.split(', ')[:10]) + ', ...' if len(print_values.split(', ')) > 10 else print_values)}\n" - f"→ fix typo{non_val_numerous[0]}, remove non-existent value{non_val_numerous[0]}, or save term{non_val_numerous[0]} via " - f"{colors.red(non_validated_hint_print)}" - ) + warning_message = f"{colors.red(f'{n_non_validated} term{s}')} {are} not validated: {colors.red(print_values)}\n" + if syn_mapper: + s = "" if len(syn_mapper) == 1 else "s" + syn_mapper_print = _print_values( + [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep="" + ) + hint_msg = f'.standardize("{key}")' + warning_message += f" {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n → curate synonyms via {colors.cyan(hint_msg)}" + if n_non_validated > len(syn_mapper): + if syn_mapper: + warning_message += " for remaining terms:\n" + warning_message += f" → fix typos, remove non-existent values, or save terms via {colors.cyan(non_validated_hint_print)}" if logger.indent == "": _log_mapping_info() @@ -1257,6 +1346,44 @@ def _log_mapping_info(): return False, non_validated +def standardize_categories( + values: Iterable[str], + field: FieldAttr, + using_key: str | None = None, + organism: str | None = None, + source: Record | None = None, +) -> dict: + """Get a synonym mapper.""" + registry = field.field.model + if not hasattr(registry, "standardize"): + return {} + # standardize values using the default instance + syn_mapper = registry.standardize( + values, + field=field.field.name, + organism=organism, + source=source, + mute=True, + return_mapper=True, + ) + + if len(values) > len(syn_mapper): # type: ignore + # standardize values using the using_key instance + if using_key is not None and using_key != "default": + registry_using = get_registry_instance(registry, using_key) + syn_mapper.update( + registry_using.standardize( + [v for v in values if v not in syn_mapper], + field=field.field.name, + organism=organism, + source=source, + mute=True, + return_mapper=True, + ) + ) + return syn_mapper + + def validate_categories_in_df( df: pd.DataFrame, fields: dict[str, FieldAttr], @@ -1304,9 +1431,9 @@ def save_artifact( Args: data: The DataFrame or AnnData object to save. - description: A description of the artifact. fields: A dictionary mapping obs_column to registry_field. columns_field: The registry field to validate variables index against. + description: A description of the artifact. organism: The organism name. adata: The AnnData object to save and get n_observations, must be provided if data is a path. type: `Literal["dataset", "model"] | None = None` The artifact type. @@ -1457,15 +1584,12 @@ def update_registry( values: list[str], field: FieldAttr, key: str, - save_function: str = "add_new_from", using_key: str | None = None, validated_only: bool = True, df: pd.DataFrame | None = None, organism: str | None = None, dtype: str | None = None, source: Record | None = None, - standardize: bool = True, - warning: bool = True, exclude: str | list | None = None, **kwargs, ) -> None: @@ -1475,13 +1599,13 @@ def update_registry( values: A list of values to be saved as labels. field: The FieldAttr object representing the field for which labels are being saved. key: The name of the feature to save. - save_function: The name of the function to save the labels. using_key: The name of the instance from which to transfer labels (if applicable). validated_only: If True, only save validated labels. df: A DataFrame to save labels from. organism: The organism name. dtype: The type of the feature. source: The source record. + exclude: Values to exclude from inspect. kwargs: Additional keyword arguments to pass to the registry model to create new records. """ from lamindb._save import save as ln_save @@ -1490,78 +1614,55 @@ def update_registry( registry = field.field.model filter_kwargs = check_registry_organism(registry, organism) filter_kwargs.update({"source": source} if source else {}) + if not values: + return verbosity = settings.verbosity try: settings.verbosity = "error" + labels_saved: dict = {"from public": [], "new": []} - # save from public + # inspect the default instance and save validated records from public filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs) - existing_and_public_records = ( - registry.from_values( - list(values), - field=field, - **filter_kwargs_current, - ) - if values - else [] + existing_and_public_records = registry.from_values( + list(values), field=field, **filter_kwargs_current ) - - labels_saved: dict = {"from public": [], "without reference": []} - + existing_and_public_labels = [ + getattr(r, field.field.name) for r in existing_and_public_records + ] + # public records that are not already in the database public_records = [r for r in existing_and_public_records if r._state.adding] # here we check to only save the public records if they are from the specified source # we check the uid because r.source and source can be from different instances if source: public_records = [r for r in public_records if r.source.uid == source.uid] - - if public_records: + if len(public_records) > 0: settings.verbosity = "info" logger.info(f"saving validated records of '{key}'") settings.verbosity = "error" - ln_save(public_records) - labels_saved["from public"] = [ - getattr(r, field.field.name) for r in public_records + ln_save(public_records) + labels_saved["from public"] = [ + getattr(r, field.field.name) for r in public_records + ] + # non-validated records from the default instance + non_validated_labels = [ + i for i in values if i not in existing_and_public_labels ] - non_public_labels = [i for i in values if i not in labels_saved["from public"]] - - # inspect the default instance - inspect_result_current = standardize_and_inspect( - values=non_public_labels, - field=field, - registry=registry, - standardize=standardize, - exclude=exclude, - **filter_kwargs_current, - ) - if not inspect_result_current.non_validated: - all_labels = registry.from_values( - inspect_result_current.validated, - field=field, - **filter_kwargs_current, - ) - settings.verbosity = verbosity - return all_labels - # inspect the using_key instance + # inspect and save validated records the using_key instance ( labels_saved[f"from {using_key}"], non_validated_labels, ) = update_registry_from_using_instance( - inspect_result_current.non_validated, + non_validated_labels, field=field, using_key=using_key, exclude=exclude, **filter_kwargs, ) - labels_saved["without reference"] = [ - i - for i in non_validated_labels - if i not in labels_saved[f"from {using_key}"] - ] - - # save non-validated records + # save non-validated/new records + labels_saved["new"] = non_validated_labels if not validated_only: non_validated_records = [] if df is not None and registry == Feature: @@ -1572,7 +1673,7 @@ def update_registry( # make sure organism record is saved to the current instance filter_kwargs["organism"] = _save_organism(name=organism) init_kwargs = {} - for value in labels_saved["without reference"]: + for value in labels_saved["new"]: init_kwargs[field.field.name] = value if registry == Feature: init_kwargs["dtype"] = "cat" if dtype is None else dtype @@ -1585,38 +1686,26 @@ def update_registry( ) ln_save(non_validated_records) - # save parent labels for ulabels + # save parent labels for ulabels, for example a parent label "project" for label "project001" if registry == ULabel and field.field.name == "name": - save_ulabels_with_parent(values, field=field, key=key) - - # # get all records that are now validated in the current instance - # all_labels = registry.from_values( - # inspect_result_current.validated + inspect_result_current.non_validated, - # field=field, - # **get_current_filter_kwargs(registry, filter_kwargs), - # ) + save_ulabels_parent(values, field=field, key=key) + finally: settings.verbosity = verbosity log_saved_labels( labels_saved, key=key, - save_function=save_function, model_field=f"{registry.__name__}.{field.field.name}", validated_only=validated_only, - warning=warning, ) - # return all_labels - def log_saved_labels( labels_saved: dict, key: str, - save_function: str, model_field: str, validated_only: bool = True, - warning: bool = True, ) -> None: """Log the saved labels.""" from ._from_values import _print_values @@ -1625,45 +1714,26 @@ def log_saved_labels( for k, labels in labels_saved.items(): if not labels: continue - - if k == "without reference" and validated_only: + if k == "new" and validated_only: continue - # msg = colors.yellow( - # f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!" - # ) - # lookup_print = ( - # f"lookup().{key}" if key.isidentifier() else f".lookup()['{key}']" - # ) - - # hint = f".add_new_from('{key}')" - # msg += f"\n → to lookup values, use {lookup_print}" - # msg += ( - # f"\n → to save, run {colors.yellow(hint)}" - # if save_function == "add_new_from" - # else f"\n → to save, run {colors.yellow(save_function)}" - # ) - # if warning: - # logger.warning(msg) - # else: - # logger.info(msg) else: - k = "" if k == "without reference" else f"{colors.green(k)} " + k = "" if k == "new" else f"{colors.green(k)} " # the term "transferred" stresses that this is always in the context of transferring # labels from a public ontology or a different instance to the present instance s = "s" if len(labels) > 1 else "" logger.success( - f"added {len(labels)} record{s} {k}with {model_field} for {colors.italic(key)}: {_print_values(labels)}" + f'added {len(labels)} record{s} {k}with {model_field} for "{key}": {_print_values(labels)}' ) -def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> None: +def save_ulabels_parent(values: list[str], field: FieldAttr, key: str) -> None: """Save a parent label for the given labels.""" registry = field.field.model assert registry == ULabel # noqa: S101 all_records = registry.from_values(list(values), field=field) - is_feature = registry.filter(name=f"is_{key}").one_or_none() + is_feature = registry.filter(name=f"{key}").one_or_none() if is_feature is None: - is_feature = registry(name=f"is_{key}").save() + is_feature = registry(name=f"{key}").save() logger.important(f"Created a parent ULabel: {is_feature}") is_feature.children.add(*all_records) @@ -1672,7 +1742,6 @@ def update_registry_from_using_instance( values: list[str], field: FieldAttr, using_key: str | None = None, - standardize: bool = False, exclude: str | list | None = None, **kwargs, ) -> tuple[list[str], list[str]]: @@ -1682,7 +1751,6 @@ def update_registry_from_using_instance( values: A list of values to be saved as labels. field: The FieldAttr object representing the field for which labels are being saved. using_key: The name of the instance from which to transfer labels (if applicable). - standardize: Whether to also standardize the values. kwargs: Additional keyword arguments to pass to the registry model. Returns: @@ -1694,11 +1762,10 @@ def update_registry_from_using_instance( if using_key is not None and using_key != "default": registry_using = get_registry_instance(field.field.model, using_key) - inspect_result_using = standardize_and_inspect( + inspect_result_using = inspect_instance( values=values, field=field, registry=registry_using, - standardize=standardize, exclude=exclude, **kwargs, ) @@ -1713,7 +1780,7 @@ def update_registry_from_using_instance( return labels_saved, not_saved -def _save_organism(name: str): # pragma: no cover +def _save_organism(name: str): """Save an organism record.""" import bionty as bt @@ -1722,8 +1789,8 @@ def _save_organism(name: str): # pragma: no cover organism = bt.Organism.from_source(name=name) if organism is None: raise ValidationError( - f"Organism '{name}' not found\n" - f" → please save it: bt.Organism(name='{name}').save()" + f'Organism "{name}" not found\n' + f' → please save it: bt.Organism(name="{name}").save()' ) organism.save() return organism diff --git a/lamindb/_from_values.py b/lamindb/_from_values.py index 295fe4709..585d35b88 100644 --- a/lamindb/_from_values.py +++ b/lamindb/_from_values.py @@ -305,7 +305,9 @@ def index_iterable(iterable: Iterable) -> pd.Index: return idx[(idx != "") & (~idx.isnull())] -def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str: +def _print_values( + names: Iterable, n: int = 20, quotes: bool = True, sep: str = "'" +) -> str: if isinstance(names, dict): items = { f"{key}: {value}": None @@ -319,7 +321,7 @@ def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str: unique_items = list(items.keys()) if quotes: - unique_items = [f"'{item}'" for item in unique_items] + unique_items = [f"{sep}{item}{sep}" for item in unique_items] print_values = ", ".join(unique_items[:n]) diff --git a/lamindb/core/_django.py b/lamindb/core/_django.py index fd9ce9e38..29c6f8b2e 100644 --- a/lamindb/core/_django.py +++ b/lamindb/core/_django.py @@ -179,14 +179,11 @@ def get_featureset_m2m_relations( # Get the correct field names for the through table through_model = getattr(FeatureSet, name).through - related_field = ( - through_model.__name__.replace("FeatureSet", "").lower().replace("_", "") - ) # Subquery to get limited related records limited_related = Subquery( through_model.objects.filter(featureset=OuterRef("pk")).values( - related_field + related_model.__name__.lower() )[:limit] ) diff --git a/tests/core/test_curate.py b/tests/core/test_curate.py index e0341eafd..1a0094d69 100644 --- a/tests/core/test_curate.py +++ b/tests/core/test_curate.py @@ -6,18 +6,16 @@ import mudata as md import pandas as pd import pytest -from lamindb._curate import CurateLookup -from lamindb.core.exceptions import ValidationError +from lamindb._curate import CurateLookup, ValidationError -@pytest.fixture(scope="module") +@pytest.fixture def df(): return pd.DataFrame( { "cell_type": [ - # there is an error in the below annotation on purpose - "cerebral pyramidal neuron", - "astrocyte", + "cerebral pyramidal neuron", # on purpose, should be "cerebral cortex pyramidal neuron" + "astrocytic glia", # synonym of astrocyte "oligodendrocyte", ], "cell_type_2": ["oligodendrocyte", "oligodendrocyte", "astrocyte"], @@ -27,13 +25,13 @@ def df(): ) -@pytest.fixture(scope="module") +@pytest.fixture def adata(): df = pd.DataFrame( { "cell_type": [ "cerebral cortex pyramidal neuron", - "astrocyte", + "astrocytic glia", # synonym of astrocyte "oligodendrocyte", ], "cell_type_2": [ @@ -49,7 +47,7 @@ def adata(): X = pd.DataFrame( { - "TCF7": [1, 2, 3], + "TCF-1": [1, 2, 3], # synonym of TCF7 "PDCD1": [4, 5, 6], "CD3E": [7, 8, 9], "CD4": [10, 11, 12], @@ -61,9 +59,11 @@ def adata(): return ad.AnnData(X=X, obs=df) -@pytest.fixture(scope="module") +@pytest.fixture def mdata(adata): - mdata = md.MuData({"rna": adata, "rna_2": adata}) + # can't be the same adata object due to in-place modifications + mdata = md.MuData({"rna": adata, "rna_2": adata.copy()}) + mdata.obs["donor"] = ["D0001", "D0002", "DOOO3"] return mdata @@ -99,16 +99,43 @@ def mock_transform(): def test_df_curator(df, categoricals): curator = ln.Curator.from_df(df, categoricals=categoricals) + with pytest.raises(ValidationError): + _ = curator.non_validated validated = curator.validate() + assert curator.non_validated == { + "cell_type": ["cerebral pyramidal neuron", "astrocytic glia"], + "donor": ["D0001", "D0002", "DOOO3"], + } assert validated is False + # deprecated method + curator.add_new_from_columns() + + # standardize + with pytest.raises(KeyError): + curator.standardize("nonexistent-key") + curator.standardize("all") + assert curator.non_validated == { + "cell_type": ["cerebral pyramidal neuron"], + "donor": ["D0001", "D0002", "DOOO3"], + } + assert "astrocyte" in df["cell_type"].values + + # add new + curator.add_new_from("donor") + assert curator.non_validated == {"cell_type": ["cerebral pyramidal neuron"]} + + # lookup cell_types = curator.lookup(public=True)["cell_type"] df["cell_type"] = df["cell_type"].replace( {"cerebral pyramidal neuron": cell_types.cerebral_cortex_pyramidal_neuron.name} ) - curator.add_new_from("donor") validated = curator.validate() assert validated is True + assert curator.non_validated == {} + + # no need to standardize + curator.standardize("cell_type") artifact = curator.save_artifact(description="test-curate-df") @@ -153,12 +180,13 @@ def test_df_curator(df, categoricals): ln.ULabel.filter().delete() bt.ExperimentalFactor.filter().delete() bt.CellType.filter().delete() + ln.FeatureSet.filter().delete() def test_custom_using_invalid_field_lookup(curate_lookup): with pytest.raises(AttributeError) as excinfo: _ = curate_lookup["invalid_field"] - assert "'CurateLookup' object has no attribute 'invalid_field'" in str( + assert '"CurateLookup" object has no attribute "invalid_field"' in str( excinfo.value ) @@ -215,13 +243,48 @@ def test_clean_up_failed_runs(): @pytest.mark.parametrize("to_add", ["donor", "all"]) def test_anndata_curator(adata, categoricals, to_add): + # must pass an organism + with pytest.raises(ValidationError): + bt.settings._organism = None # make sure organism is not set globally + ln.Curator.from_anndata( + adata, + categoricals=categoricals, + var_index=bt.Gene.symbol, + ).validate() + curator = ln.Curator.from_anndata( adata, categoricals=categoricals, var_index=bt.Gene.symbol, organism="human", ) + validated = curator.validate() + assert validated is False + assert curator.non_validated == { + "cell_type": ["astrocytic glia"], + "donor": ["D0001", "D0002", "DOOO3"], + "var_index": ["TCF-1"], + } + + # standardize var_index + curator.standardize("var_index") + assert "TCF7" in adata.var.index + assert curator.non_validated == { + "cell_type": ["astrocytic glia"], + "donor": ["D0001", "D0002", "DOOO3"], + } + curator.standardize("all") + assert curator.non_validated == {"donor": ["D0001", "D0002", "DOOO3"]} + + # lookup + lookup = curator.lookup() + assert lookup.cell_type.oligodendrocyte.name == "oligodendrocyte" + + # add new curator.add_new_from(to_add) + assert curator.non_validated == {} + # just for coverage, doesn't do anything + curator.add_new_from_var_index() validated = curator.validate() assert validated @@ -242,6 +305,8 @@ def test_anndata_curator(adata, categoricals, to_add): ln.ULabel.filter().delete() bt.ExperimentalFactor.filter().delete() bt.CellType.filter().delete() + ln.FeatureSet.filter().delete() + bt.Gene.filter().delete() def test_str_var_index(adata): @@ -253,14 +318,14 @@ def test_str_var_index(adata): ) -def test_no_categoricals(adata): +def test_not_passing_categoricals(adata): curator = ln.Curator.from_anndata( adata, var_index=bt.Gene.symbol, organism="human", ) validated = curator.validate() - assert validated + assert validated is False def test_anndata_curator_wrong_type(df, categoricals): @@ -276,7 +341,7 @@ def test_anndata_curator_wrong_type(df, categoricals): def test_categorical_key_not_present(df): with pytest.raises( ValidationError, - match="the following keys passed to categoricals are not allowed:", + match="the following 1 key passed to categoricals is not allowed:", ): ln.Curator.from_df( df, @@ -287,7 +352,8 @@ def test_categorical_key_not_present(df): def test_source_key_not_present(adata, categoricals): with pytest.raises( - ValidationError, match="the following keys passed to sources are not allowed:" + ValidationError, + match="the following 1 key passed to sources is not allowed:", ): ln.Curator.from_anndata( adata, @@ -319,6 +385,7 @@ def test_mudata_curator(mdata): "rna_2:cell_type": bt.CellType.name, "rna_2:assay_ontology_id": bt.ExperimentalFactor.ontology_id, "rna_2:donor": ln.ULabel.name, + "donor": ln.ULabel.name, } curator = ln.Curator.from_mudata( @@ -327,7 +394,42 @@ def test_mudata_curator(mdata): var_index={"rna": bt.Gene.symbol, "rna_2": bt.Gene.symbol}, organism="human", ) - curator.add_new_from("donor", modality="rna") + with pytest.raises(ValidationError): + _ = curator.non_validated + assert curator._modalities == {"obs", "rna", "rna_2"} + + # validate + validated = curator.validate() + assert curator.non_validated == { + "obs": {"donor": ["D0001", "D0002", "DOOO3"]}, + "rna_2": { + "cell_type": ["astrocytic glia"], + "donor": ["D0001", "D0002", "DOOO3"], + "var_index": ["TCF-1"], + }, + "rna": { + "cell_type": ["astrocytic glia"], + "donor": ["D0001", "D0002", "DOOO3"], + "var_index": ["TCF-1"], + }, + } + + # lookup + lookup = curator.lookup() + assert lookup["obs:donor"].donor.name == "donor" + + # standardize + curator.standardize("all", modality="rna") + curator.standardize("all", modality="rna_2") + assert curator._mod_adata_curators["rna_2"].non_validated == { + "donor": ["D0001", "D0002", "DOOO3"] + } + + # add new + curator.add_new_from_columns("rna") # deprecated, doesn't do anything + curator.add_new_from_var_index("rna") # doesn't do anything + curator.add_new_from("donor") + validated = curator.validate() assert validated artifact = curator.save_artifact(description="test MuData") @@ -337,3 +439,5 @@ def test_mudata_curator(mdata): ln.ULabel.filter().delete() bt.ExperimentalFactor.filter().delete() bt.CellType.filter().delete() + ln.FeatureSet.filter().delete() + bt.Gene.filter().delete()