diff --git a/docs/curate-df.ipynb b/docs/curate-df.ipynb index efa1ef1bc..0913f59e1 100644 --- a/docs/curate-df.ipynb +++ b/docs/curate-df.ipynb @@ -86,7 +86,7 @@ "id": "c7d74af9", "metadata": {}, "source": [ - "Define validation criteria and create a {class}`~lamindb.Curate` object." + "Define validation criteria and create a {class}`~lamindb.Curator` object." ] }, { @@ -109,7 +109,7 @@ "}\n", "\n", "# pass validation criteria\n", - "curate = ln.Curate.from_df(df, categoricals=categoricals)" + "curate = ln.Curator.from_df(df, categoricals=categoricals)" ] }, { @@ -340,7 +340,7 @@ }, "outputs": [], "source": [ - "curate = ln.Curate.from_anndata(\n", + "curate = ln.Curator.from_anndata(\n", " adata, \n", " var_index=bt.Gene.ensembl_gene_id, # validate var.index against Gene.ensembl_gene_id\n", " categoricals=categoricals, \n", @@ -362,6 +362,24 @@ "curate.validate()" ] }, + { + "cell_type": "markdown", + "id": "71a917d1", + "metadata": {}, + "source": [ + "Save the validated genes following the instruction:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e31dabcc", + "metadata": {}, + "outputs": [], + "source": [ + "curate.add_validated_from_var_index()" + ] + }, { "cell_type": "markdown", "id": "614545ea", @@ -421,7 +439,7 @@ }, "outputs": [], "source": [ - "curate = ln.Curate.from_anndata(\n", + "curate = ln.Curator.from_anndata(\n", " adata_validated, \n", " var_index=bt.Gene.ensembl_gene_id, # validate var.index against Gene.ensembl_gene_id\n", " categoricals=categoricals, \n", diff --git a/docs/introduction.ipynb b/docs/introduction.ipynb index e45d05cc9..2a07cb6df 100644 --- a/docs/introduction.ipynb +++ b/docs/introduction.ipynb @@ -626,7 +626,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's use the high-level {class}`~lamindb.Curate` class to curate a `DataFrame`." + "Let's use the high-level {class}`~lamindb.Curator` class to curate a `DataFrame`." 
] }, { @@ -639,8 +639,8 @@ }, "outputs": [], "source": [ - "# construct a Curate object to validate & annotate a DataFrame\n", - "curate = ln.Curate.from_df(\n", + "# construct a Curator object to validate & annotate a DataFrame\n", + "curate = ln.Curator.from_df(\n", " df_fixed_typo,\n", " # define validation criteria\n", " columns=ln.Feature.name, # map column names\n", @@ -712,7 +712,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Had we used `ln.Curate` from the beginning, we would have caught the typo." + "Had we used `ln.Curator` from the beginning, we would have caught the typo." ] }, { @@ -725,8 +725,8 @@ }, "outputs": [], "source": [ - "# construct a Curate object to validate & annotate a DataFrame\n", - "curate = ln.Curate.from_df(\n", + "# construct a Curator object to validate & annotate a DataFrame\n", + "curate = ln.Curator.from_df(\n", " df_with_typo,\n", " # define validation criteria\n", " columns=ln.Feature.name, # map column names\n", @@ -810,7 +810,7 @@ "adata = ad.AnnData(df_fixed_typo[[\"CD8A\", \"CD4\", \"CD14\"]], obs=df_fixed_typo[[\"perturbation\"]])\n", "\n", "# create an annotation flow for an AnnData object\n", - "curate = ln.Curate.from_anndata(\n", + "curate = ln.Curator.from_anndata(\n", " adata,\n", " # define validation criteria\n", " var_index=bt.Gene.symbol, # map .var.index onto Gene registry\n", @@ -914,12 +914,13 @@ "adata = ad.AnnData(df[[\"CD8A\", \"CD4\", \"CD38\"]], obs=df[[\"perturbation\"]])\n", "\n", "# validate, curate and save a new artifact\n", - "curate = ln.Curate.from_anndata(\n", + "curate = ln.Curator.from_anndata(\n", " adata,\n", " var_index=bt.Gene.symbol,\n", " categoricals={adata.obs.perturbation.name: ln.ULabel.name},\n", " organism=\"human\"\n", ")\n", + "curate.add_validated_from_var_index()\n", "curate.validate()\n", "artifact2 = curate.save_artifact(description=\"my RNA-seq dataset 2\")" ] diff --git a/lamindb/_curate.py b/lamindb/_curate.py index fefb04cfb..c73a84ac9 100644 --- 
a/lamindb/_curate.py +++ b/lamindb/_curate.py @@ -125,7 +125,7 @@ class DataFrameCurator(BaseCurator): Examples: >>> import bionty as bt - >>> curate = ln.Curate.from_df( + >>> curate = ln.Curator.from_df( ... df, ... categoricals={ ... "cell_type_ontology_id": bt.CellType.ontology_id, @@ -208,6 +208,7 @@ def _save_columns(self, validated_only: bool = True, **kwargs) -> None: using_key=self._using_key, validated_only=False, source=self._sources.get("columns"), + exclude=self._exclude.get("columns"), **kwargs, ) @@ -223,6 +224,7 @@ def _save_columns(self, validated_only: bool = True, **kwargs) -> None: validated_only=validated_only, df=self._df, # Get the Feature type from df source=self._sources.get("columns"), + exclude=self._exclude.get("columns"), warning=False, # Do not warn about missing columns, just an info message **kwargs, ) @@ -275,6 +277,7 @@ def _update_registry(self, categorical: str, validated_only: bool = True, **kwar using_key=self._using_key, validated_only=validated_only, source=self._sources.get(categorical), + exclude=self._exclude.get(categorical), **kwargs, ) @@ -356,7 +359,7 @@ class AnnDataCurator(DataFrameCurator): See also :class:`~lamindb.Curator`. - Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curate.from_anndata`. + Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`. See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version. @@ -372,7 +375,7 @@ class AnnDataCurator(DataFrameCurator): Examples: >>> import bionty as bt - >>> curate = ln.Curate.from_anndata( + >>> curate = ln.Curator.from_anndata( ... adata, ... var_index=bt.Gene.ensembl_gene_id, ... 
categoricals={ @@ -463,6 +466,7 @@ def _save_from_var_index( validated_only=validated_only, organism=organism, source=self._sources.get("var_index"), + exclude=self._exclude.get("var_index"), ) def _update_registry_all(self, validated_only: bool = True, **kwargs): @@ -565,7 +569,7 @@ class MuDataCurator: See also :class:`~lamindb.Curator`. Note that if genes or other measurements are removed from the MuData object, - the object should be recreated using :meth:`~lamindb.Curate.from_mudata`. + the object should be recreated using :meth:`~lamindb.Curator.from_mudata`. Args: mdata: The MuData object to curate. @@ -582,7 +586,7 @@ class MuDataCurator: Examples: >>> import bionty as bt - >>> curate = ln.Curate.from_mudata( + >>> curate = ln.Curator.from_mudata( ... mdata, ... var_index={ ... "rna": bt.Gene.ensembl_gene_id, @@ -667,6 +671,7 @@ def _save_from_var_index_modality( validated_only=validated_only, dtype="number", source=self._sources.get(modality, {}).get("var_index"), + exclude=self._exclude.get(modality, {}).get("var_index"), **kwargs, ) @@ -730,6 +735,7 @@ def add_new_from_columns( validated_only=False, df=self._mdata[modality].obs, source=self._sources.get(modality, {}).get("columns"), + exclude=self._exclude.get(modality, {}).get("columns"), **self._kwargs, # type: ignore **kwargs, ) @@ -815,7 +821,8 @@ def validate(self, organism: str | None = None) -> bool: field=var_field, key=f"{modality}_var_index", using_key=self._using_key, - exclude=self._exclude.get(f"{modality}_var_index"), + source=self._sources.get(modality, {}).get("var_index"), + exclude=self._exclude.get(modality, {}).get("var_index"), **self._kwargs, # type: ignore ) validated_var &= is_validated_var @@ -882,9 +889,9 @@ class Curator(BaseCurator): 1. 
Instantiate `Curator` from one of the following dataset objects: - - :meth:`~lamindb.Curate.from_df` - - :meth:`~lamindb.Curate.from_anndata` - - :meth:`~lamindb.Curate.from_mudata` + - :meth:`~lamindb.Curator.from_df` + - :meth:`~lamindb.Curator.from_anndata` + - :meth:`~lamindb.Curator.from_mudata` During object creation, any passed categoricals found in the object will be saved. @@ -1008,10 +1015,22 @@ def standardize_and_inspect( field: FieldAttr, registry: type[Record], standardize: bool = False, + exclude: str | list | None = None, **kwargs, ): """Standardize and inspect values using a registry.""" - filter_kwargs = get_current_filter_kwargs(registry, kwargs) + # inspect exclude values in the default instance + values = list(values) + include_validated = [] + if exclude is not None: + exclude = [exclude] if isinstance(exclude, str) else exclude + exclude = [i for i in exclude if i in values] + if len(exclude) > 0: + # exclude values are validated without source and organism + inspect_result_exclude = registry.inspect(exclude, field=field, mute=True) + # if exclude values are validated, remove them from the values + values = [i for i in values if i not in inspect_result_exclude.validated] + include_validated = inspect_result_exclude.validated if standardize: if hasattr(registry, "standardize") and hasattr( @@ -1019,11 +1038,17 @@ def standardize_and_inspect( "synonyms", # https://github.com/laminlabs/lamindb/issues/1685 ): standardized_values = registry.standardize( - values, field=field, mute=True, **filter_kwargs + values, field=field, mute=True, **kwargs ) values = standardized_values - return registry.inspect(values, field=field, mute=True, **filter_kwargs) + inspect_result = registry.inspect(values, field=field, mute=True, **kwargs) + inspect_result._validated += include_validated + inspect_result._non_validated = [ + i for i in inspect_result.non_validated if i not in include_validated + ] + + return inspect_result def check_registry_organism(registry: 
Record, organism: str | None = None) -> dict: @@ -1075,35 +1100,32 @@ def _log_mapping_info(): logger.indent = " " registry = field.field.model + kwargs = check_registry_organism(registry, organism) kwargs.update({"source": source} if source else {}) + kwargs_current = get_current_filter_kwargs(registry, kwargs) # inspect the default instance - if exclude is not None: - exclude = [exclude] if isinstance(exclude, str) else exclude - # exclude values are validated without source and organism - inspect_result = registry.inspect(exclude, field=field, mute=True) - # if exclude values are validated, remove them from the values - values = [i for i in values if i not in inspect_result.validated] - inspect_result = standardize_and_inspect( values=values, field=field, registry=registry, standardize=standardize, - **kwargs, + exclude=exclude, + **kwargs_current, ) non_validated = inspect_result.non_validated + # inspect the using instance values_validated = [] if using_key is not None and using_key != "default" and non_validated: registry_using = get_registry_instance(registry, using_key) - # inspect the using instance inspect_result = standardize_and_inspect( values=non_validated, field=field, registry=registry_using, standardize=standardize, + exclude=exclude, **kwargs, ) non_validated = inspect_result.non_validated @@ -1117,7 +1139,7 @@ def _log_mapping_info(): public_records = registry.from_values( non_validated, field=field, - **get_current_filter_kwargs(registry, kwargs), + **kwargs_current, ) values_validated += [getattr(r, field.field.name) for r in public_records] finally: @@ -1137,9 +1159,13 @@ def _log_mapping_info(): non_validated = [i for i in non_validated if i not in values_validated] n_non_validated = len(non_validated) if n_non_validated == 0: - logger.indent = "" - logger.success(f"{key} is validated against {colors.italic(model_field)}") - return True, [] + if n_validated == 0: + logger.indent = "" + logger.success(f"{key} is validated against 
{colors.italic(model_field)}") + return True, [] + else: + # validated values still need to be saved to the current instance + return False, [] else: are = "are" if n_non_validated > 1 else "is" print_values = _print_values(non_validated) @@ -1164,6 +1190,9 @@ def validate_categories_in_df( **kwargs, ) -> tuple[bool, dict]: """Validate categories in DataFrame columns using LaminDB registries.""" + if not fields: + return True, {} + if sources is None: sources = {} validated = True @@ -1296,6 +1325,7 @@ def update_registry( source: Record | None = None, standardize: bool = True, warning: bool = True, + exclude: str | list | None = None, **kwargs, ) -> None: """Save features or labels records in the default instance from the using_key instance. @@ -1355,7 +1385,8 @@ def update_registry( field=field, registry=registry, standardize=standardize, - **filter_kwargs, + exclude=exclude, + **filter_kwargs_current, ) if not inspect_result_current.non_validated: all_labels = registry.from_values( @@ -1374,6 +1405,7 @@ def update_registry( inspect_result_current.non_validated, field=field, using_key=using_key, + exclude=exclude, **filter_kwargs, ) @@ -1493,6 +1525,7 @@ def update_registry_from_using_instance( field: FieldAttr, using_key: str | None = None, standardize: bool = False, + exclude: str | list | None = None, **kwargs, ) -> tuple[list[str], list[str]]: """Save features or labels records from the using_key instance. 
@@ -1518,6 +1551,7 @@ def update_registry_from_using_instance( field=field, registry=registry_using, standardize=standardize, + exclude=exclude, **kwargs, ) labels_using = registry_using.filter( diff --git a/pyproject.toml b/pyproject.toml index 419ab91ad..e82152784 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,8 @@ classifiers = [ ] dependencies = [ # Lamin PINNED packages - "lnschema_core==0.73.2", - "lamindb_setup==0.76.6", + "lnschema_core==0.73.3", + "lamindb_setup==0.76.7", "lamin_utils==0.13.4", "lamin_cli==0.16.2", # others diff --git a/sub/lamindb-setup b/sub/lamindb-setup index b782d20a6..9d9ce0d0b 160000 --- a/sub/lamindb-setup +++ b/sub/lamindb-setup @@ -1 +1 @@ -Subproject commit b782d20a620597d2444cfd062467d91775a69c41 +Subproject commit 9d9ce0d0b6e1c888b397be96603b72bb3219b0c6 diff --git a/sub/lnschema-core b/sub/lnschema-core index 891b7c033..5c7240132 160000 --- a/sub/lnschema-core +++ b/sub/lnschema-core @@ -1 +1 @@ -Subproject commit 891b7c033ff3bf693c686006e31819cf7ff46a5b +Subproject commit 5c72401326df22c45b5edac2f21e8a72d32ca99f diff --git a/tests/core/test_curate.py b/tests/core/test_curate.py index 79755c222..ba3bb8b95 100644 --- a/tests/core/test_curate.py +++ b/tests/core/test_curate.py @@ -86,7 +86,7 @@ def mock_transform(): def test_df_annotator(df, categoricals): - curate = ln.Curate.from_df(df, categoricals=categoricals) + curate = ln.Curator.from_df(df, categoricals=categoricals) validated = curate.validate() assert validated is False @@ -115,28 +115,28 @@ def test_custom_using_invalid_field_lookup(curate_lookup): def test_missing_columns(df): with pytest.raises(ValueError) as error: - ln.Curate.from_df(df, categoricals={"missing_column": "some_registry_field"}) + ln.Curator.from_df(df, categoricals={"missing_column": "some_registry_field"}) assert "Columns {'missing_column'} are not found in the data object!" 
in str( error.value ) def test_additional_args_with_all_key(df, categoricals): - curate = ln.Curate.from_df(df, categoricals=categoricals) + curate = ln.Curator.from_df(df, categoricals=categoricals) with pytest.raises(ValueError) as error: curate.add_new_from("all", extra_arg="not_allowed") assert "Cannot pass additional arguments to 'all' key!" in str(error.value) def test_save_columns_not_defined_in_fields(df, categoricals): - curate = ln.Curate.from_df(df, categoricals=categoricals) + curate = ln.Curator.from_df(df, categoricals=categoricals) with pytest.raises(ValueError) as error: curate._update_registry("nonexistent") assert "Feature nonexistent is not part of the fields!" in str(error.value) def test_unvalidated_data_object(df, categoricals): - curate = ln.Curate.from_df(df, categoricals=categoricals) + curate = ln.Curator.from_df(df, categoricals=categoricals) with pytest.raises(ValidationError) as error: curate.save_artifact() assert "Data object is not validated" in str(error.value) @@ -161,7 +161,7 @@ def test_clean_up_failed_runs(): assert len(ln.Run.filter(transform=mock_transform).all()) == 2 - curate = ln.Curate.from_df(pd.DataFrame()) + curate = ln.Curator.from_df(pd.DataFrame()) curate.clean_up_failed_runs() assert len(ln.Run.filter(transform=mock_transform).all()) == 1 @@ -172,13 +172,14 @@ def test_clean_up_failed_runs(): def test_anndata_annotator(adata, categoricals): - curate = ln.Curate.from_anndata( + curate = ln.Curator.from_anndata( adata, categoricals=categoricals, var_index=bt.Gene.symbol, organism="human", ) curate.add_validated_from("all") + curate.add_validated_from_var_index() curate.add_new_from("donor") validated = curate.validate() assert validated @@ -192,9 +193,20 @@ def test_anndata_annotator(adata, categoricals): bt.CellType.filter().all().delete() +def test_no_categoricals(adata): + curate = ln.Curate.from_anndata( + adata, + var_index=bt.Gene.symbol, + organism="human", + ) + curate.add_validated_from("all") + validated = 
curate.validate() + assert validated + + def test_anndata_annotator_wrong_type(df, categoricals): with pytest.raises(ValueError) as error: - ln.Curate.from_anndata( + ln.Curator.from_anndata( df, categoricals=categoricals, var_index=bt.Gene.symbol, @@ -204,7 +216,7 @@ def test_anndata_annotator_wrong_type(df, categoricals): def test_unvalidated_adata_object(adata, categoricals): - curate = ln.Curate.from_anndata( + curate = ln.Curator.from_anndata( adata, categoricals=categoricals, var_index=bt.Gene.symbol, @@ -225,7 +237,7 @@ def test_mudata_annotator(mdata): "rna_2:donor": ln.ULabel.name, } - curate = ln.Curate.from_mudata( + curate = ln.Curator.from_mudata( mdata, categoricals=categoricals, var_index={"rna": bt.Gene.symbol, "rna_2": bt.Gene.symbol},