# Reconstructed from the patch "SOMACurator initial"
# (commit 8919301613259ed6b69b832ba87ff5ded2f0f945, file lamindb/_curate.py).
# The same patch adds the module-level import
# `from lamindb_setup.core.upath import UPath`, which this class relies on.
class SOMACurator(BaseCurator):
    """Validate categorical ``obs`` columns of a tiledbsoma experiment.

    Args:
        experiment_uri: Location of the experiment; wrapped in ``UPath``.
        var_index: Accepted, but not used by this class yet.
        categoricals: Maps an ``obs`` column name to the registry field its
            values are validated against. ``None`` means no columns.
        using_key: Forwarded unchanged to ``update_registry`` and
            ``validate_categories``.
        verbosity: Accepted, but not used by this class yet.
        organism: Accepted, but not used by this class yet.
        sources: Looked up by ``obs`` column name; the match is forwarded as
            ``source`` to the registry helpers.
        exclude: Looked up by ``obs`` column name; the match is forwarded as
            ``exclude`` to the registry helpers.
    """

    def __init__(
        self,
        experiment_uri: UPathStr,
        var_index: dict[str, FieldAttr],
        categoricals: dict[str, FieldAttr] | None = None,
        using_key: str | None = None,
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,  # {modality: {field: [values]}}
    ):
        self._experiment_uri = UPath(experiment_uri)
        self.obs_fields = categoricals or {}
        self._using_key = using_key
        self._sources = sources or {}
        # NOTE(review): the signature comment describes a nested
        # {modality: {field: [values]}} layout, but validate() looks entries up
        # directly by obs column key - confirm the intended shape.
        self._exclude = exclude or {}
        # Create the validation state up front so it exists (as "not validated,
        # nothing collected") before validate() has been called.
        self._validated = False
        self._non_validated = {}

    def validate(self):
        """Validate every configured ``obs`` column against its registry field.

        For each entry of ``self.obs_fields`` the unique values of that column
        are read from the experiment, passed to ``update_registry`` (with
        ``validated_only=True``) and then to ``validate_categories``. Values
        reported as non-validated are collected per column in
        ``self._non_validated``.

        Returns:
            The combined validation flag over all columns, which is also
            stored in ``self._validated``. With no configured columns the
            result is ``True``.
        """
        from pyarrow.compute import unique

        from lamindb.core.storage._tiledbsoma import _open_tiledbsoma

        # Reset both pieces of state first: if anything below raises, a result
        # from an earlier run must not be left looking current.
        self._validated = False
        self._non_validated = {}

        all_valid = True
        with _open_tiledbsoma(self._experiment_uri) as experiment:
            obs = experiment["obs"]
            for key, field in self.obs_fields.items():
                # Only the single column is read; it is then reduced to its
                # distinct values before touching the registry.
                column = obs.read(column_names=[key]).concat()[key]
                values = unique(column).to_pylist()
                source = self._sources.get(key)
                exclude = self._exclude.get(key)
                update_registry(
                    values=values,
                    field=field,
                    key=key,
                    using_key=self._using_key,
                    validated_only=True,
                    source=source,
                    exclude=exclude,
                )
                is_val, non_val = validate_categories(
                    values=values,
                    field=field,
                    key=key,
                    using_key=self._using_key,
                    source=source,
                    exclude=exclude,
                )
                all_valid &= is_val
                if len(non_val) > 0:
                    self._non_validated[key] = non_val
        self._validated = all_valid
        return self._validated