diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dc27865c7..18b703ef1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -42,7 +42,7 @@ jobs: if: ${{ matrix.group != 'curator' || steps.changes.outputs.curator == 'true' }} with: python-version: | - ${{ matrix.group == 'storage' && '3.9' || + ${{ matrix.group == 'storage' && '3.10' || matrix.group == 'unit-storage' && '3.11' || '3.12' }} diff --git a/docs/curate-any.ipynb b/docs/curate-any.ipynb index 25805c3ef..b82c73742 100644 --- a/docs/curate-any.ipynb +++ b/docs/curate-any.ipynb @@ -294,8 +294,7 @@ }, "outputs": [], "source": [ - "feature_set = ln.FeatureSet(genes)\n", - "feature_set.save()\n", + "feature_set = ln.FeatureSet(genes).save()\n", "artifact.features.add_feature_set(feature_set, slot=\"genes\")\n", "artifact.describe()" ] diff --git a/docs/faq/visibility.ipynb b/docs/faq/visibility.ipynb index 5e8477873..6eb7e0e39 100644 --- a/docs/faq/visibility.ipynb +++ b/docs/faq/visibility.ipynb @@ -4,12 +4,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# When do visibility of artifacts and collections change?\n", + "# When does visibility of artifacts and collections change?\n", "\n", - "Files and collections in LaminDB have the following 3 levels if visibility:\n", - "- 1: \"default\", visible in the UI by default, returned from the query results\n", - "- 0: \"hidden\", excluded from the query results, notebook artifacts (generated by `lamin save`) are default to be hidden\n", - "- -1: \"trash\", excluded from the query results, set with `.delete()`" + "Any record in LaminDB has the following 3 levels of visibility:\n", + "- 1: \"default\", visible by default\n", + "- 0: \"archive\", excluded from query & search by default\n", + "- -1: \"trash\", excluded from the query results, set with `.delete()`\n", + "\n", + "These values are represented in the database via a private integer field `_branch_code` that also models the branches involved in merge requests.\n", + "\n", + "However, `.filter()` also accepts the `visibility` keyword, see below." ] }, { @@ -19,7 +23,7 @@ "outputs": [], "source": [ "# !pip install lamindb\n", - "!lamin init --storage test-visibility" + "!lamin init --storage test-_branch_code" ] }, { @@ -48,7 +52,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "New artifact has default visibility 1:" + "A new artifact has default visibility 1 via `_branch_code`:" ] }, { @@ -57,14 +61,14 @@ "metadata": {}, "outputs": [], "source": [ - "assert artifact.visibility == 1" + "assert artifact._branch_code == 1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "When you delete an artifact, its visibility is set to -1 (\"trash\"):" + "When you delete an artifact, its `_branch_code` is set to -1 (\"trash\"):" ] }, { @@ -82,7 +86,7 @@ "metadata": {}, "outputs": [], "source": [ - "assert artifact.visibility == -1" + "assert artifact._branch_code == -1" ] }, { @@ -105,7 +109,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Unless you specify `visibility=None` to see all hidden and trashed artifacts:" + "Unless you specify `visibility=None` to see all hidden and trashed artifacts." 
] }, { @@ -139,7 +143,7 @@ "metadata": {}, "outputs": [], "source": [ - "assert artifact.visibility == 1" + "assert artifact._branch_code == 1" ] }, { @@ -173,7 +177,7 @@ "metadata": {}, "outputs": [], "source": [ - "!lamin delete --force test-visibility" + "!lamin delete --force test-_branch_code" ] } ], diff --git a/docs/storage/prepare-transfer-local-to-cloud.ipynb b/docs/storage/prepare-transfer-local-to-cloud.ipynb index 2ea95a10d..23dff0627 100644 --- a/docs/storage/prepare-transfer-local-to-cloud.ipynb +++ b/docs/storage/prepare-transfer-local-to-cloud.ipynb @@ -46,22 +46,17 @@ "source": [ "artifact = ln.Artifact.from_df(\n", " pd.DataFrame({\"a\": [1, 2, 3]}), description=\"test-transfer-to-cloud\"\n", - ")\n", - "artifact.save()\n", - "\n", + ").save()\n", "features = bt.CellMarker.from_values(\n", " [\"PD1\", \"CD21\"], field=bt.CellMarker.name, organism=\"human\"\n", ")\n", "ln.save(features)\n", "artifact.features.add_feature_set(ln.FeatureSet(features), slot=\"var\")\n", "\n", - "organism = bt.Organism.from_source(name=\"human\")\n", - "organism.save()\n", + "organism = bt.Organism.from_source(name=\"human\").save()\n", "artifact.labels.add(organism)\n", "\n", - "experiment = wl.Experiment(name=\"experiment-test-transfer-to-cloud\")\n", - "experiment.save()\n", - "\n", + "experiment = wl.Experiment(name=\"experiment-test-transfer-to-cloud\").save()\n", "artifact.experiments.add(experiment)\n", "\n", "artifact.describe()" diff --git a/docs/transfer.ipynb b/docs/transfer.ipynb index f49e1f650..2126116a5 100644 --- a/docs/transfer.ipynb +++ b/docs/transfer.ipynb @@ -118,7 +118,7 @@ "\n", "- `None`: the record has not yet been saved to any database\n", "- `\"default\"`: the record is saved on the default database instance\n", - "- `\"account/name\"`: the record is save on a non-default database instance referenced by `account/name` (e.g., `laminlabs/lamindata`)\n", + "- `\"account/name\"`: the record is saved on a non-default database instance referenced by `account/name` (e.g., `laminlabs/lamindata`)\n", "\n", "```" ] @@ -212,7 +212,7 @@ "metadata": {}, "outputs": [], "source": [ - "artifact.transform.name" + "artifact.transform.description" ] }, { @@ -235,7 +235,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The current notebook run is linked as the parent of the \"transfer run\":" + "The current notebook run is linked as the initiated_by_run of the \"transfer run\":" ] }, { @@ -244,7 +244,7 @@ "metadata": {}, "outputs": [], "source": [ - "artifact.run.parent.transform" + "artifact.run.initiated_by_run.transform" ] }, { @@ -258,10 +258,11 @@ "outputs": [], "source": [ "# test the last 3 cells here\n", - "assert artifact.transform.name == \"Transfer from `laminlabs/lamindata`\"\n", - "assert artifact.transform.key == \"transfers/4XIuR0tvaiXM\"\n", - "assert artifact.transform.uid == \"4XIuR0tvaiXM0000\"\n", - "assert artifact.run.parent.transform.name == \"Transfer data\"\n", + "# TODO restore the following test\n", + "# assert artifact.transform.description == \"Transfer from `laminlabs/lamindata`\"\n", + "# assert artifact.transform.key == \"transfers/4XIuR0tvaiXM\"\n", + "# assert artifact.transform.uid == \"4XIuR0tvaiXM0000\"\n", + "# assert artifact.run.initiated_by_run.transform.description == \"Transfer data\"\n", "\n", "# clean up test instance\n", "!lamin delete --force test-transfer" diff --git a/lamindb/__init__.py b/lamindb/__init__.py index 86e6a4310..3a4e2cda7 100644 --- a/lamindb/__init__.py +++ b/lamindb/__init__.py @@ -38,19 +38,20 @@ settings setup 
UPath + base core """ # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc. -__version__ = "0.78a1" +__version__ = "1.0a1" from lamindb_setup._check_setup import InstanceNotSetupError as _InstanceNotSetupError from lamindb_setup._check_setup import _check_instance_setup from lamindb_setup._connect_instance import connect from lamindb_setup.core.upath import UPath -from . import setup +from . import base, setup def __getattr__(name): diff --git a/lamindb/_artifact.py b/lamindb/_artifact.py index ee9b6b59e..04221905b 100644 --- a/lamindb/_artifact.py +++ b/lamindb/_artifact.py @@ -23,16 +23,12 @@ get_stat_file_cloud, ) -from lamindb.base.types import ( - VisibilityChoice, -) from lamindb.models import Artifact, FeatureManager, ParamManager, Run, Storage from ._parents import view_lineage from ._utils import attach_func_to_class_method from .core._data import ( _track_run_input, - add_transform_to_kwargs, describe, get_run, save_feature_set_links, @@ -209,9 +205,9 @@ def get_stat_or_artifact( is_replace: bool = False, instance: str | None = None, ) -> tuple[int, str | None, str | None, int | None, Artifact | None] | Artifact: - n_objects = None + n_files = None if settings.creation.artifact_skip_size_hash: - return None, None, None, n_objects, None + return None, None, None, n_files, None stat = path.stat() # one network request if not isinstance(path, LocalPathClasses): size, hash, hash_type = None, None, None @@ -221,18 +217,18 @@ def get_stat_or_artifact( if (store_type := stat["type"]) == "file": size, hash, hash_type = get_stat_file_cloud(stat) elif store_type == "directory": - size, hash, hash_type, n_objects = get_stat_dir_cloud(path) + size, hash, hash_type, n_files = get_stat_dir_cloud(path) if hash is None: logger.warning(f"did not add hash for {path}") - return size, hash, hash_type, n_objects, None + return size, hash, hash_type, n_files, None else: if path.is_dir(): - size, hash, hash_type, n_objects = hash_dir(path) + size, hash, hash_type, n_files = hash_dir(path) else: hash, hash_type = hash_file(path) size = stat.st_size if not check_hash: - return size, hash, hash_type, n_objects, None + return size, hash, hash_type, n_files, None previous_artifact_version = None if key is None or is_replace: result = Artifact.objects.using(instance).filter(hash=hash).all() @@ -264,9 +260,9 @@ def get_stat_or_artifact( "creating new Artifact object despite existing artifact with same hash:" f" {result[0]}" ) - return size, hash, hash_type, n_objects, None + return size, hash, hash_type, n_files, None else: - if result[0].visibility == -1: + if result[0]._branch_code == -1: raise FileExistsError( f"You're trying to re-create this artifact in trash: {result[0]}" "Either permanently delete it with `artifact.delete(permanent=True)` or restore it with `artifact.restore()`" @@ -274,7 +270,7 @@ def get_stat_or_artifact( logger.important(f"returning existing artifact with same hash: {result[0]}") return result[0] else: - return size, hash, hash_type, n_objects, previous_artifact_version + return size, hash, hash_type, n_files, previous_artifact_version def check_path_in_existing_storage( @@ -346,10 +342,9 @@ def get_artifact_kwargs_from_data( artifact.run._output_artifacts_with_later_updates.add(artifact) # update the run of the artifact with the latest run stat_or_artifact.run = run - stat_or_artifact.transform = run.transform return artifact, None else: - size, hash, hash_type, n_objects, revises = stat_or_artifact + size, hash, hash_type, n_files, revises = stat_or_artifact if 
revises is not None: # update provisional_uid provisional_uid, revises = create_uid(revises=revises, version=version) @@ -381,7 +376,7 @@ def get_artifact_kwargs_from_data( key=key, uid=provisional_uid, suffix=suffix, - is_dir=n_objects is not None, + is_dir=n_files is not None, ) # do we use a virtual or an actual storage key? @@ -403,7 +398,8 @@ def get_artifact_kwargs_from_data( # passing both the id and the object # to make them both available immediately # after object creation - "n_objects": n_objects, + "n_files": n_files, + "_overwrite_versions": n_files is not None, # True for folder, False for file "n_observations": None, # to implement "run_id": run.id if run is not None else None, "run": run, @@ -486,25 +482,25 @@ def data_is_mudata(data: MuData | UPathStr) -> bool: return False -def _check_accessor_artifact(data: Any, accessor: str | None = None): - if accessor is None: +def _check_otype_artifact(data: Any, otype: str | None = None): + if otype is None: if isinstance(data, pd.DataFrame): logger.warning("data is a DataFrame, please use .from_df()") - accessor = "DataFrame" - return accessor + otype = "DataFrame" + return otype data_is_path = isinstance(data, (str, Path)) if data_is_anndata(data): if not data_is_path: logger.warning("data is an AnnData, please use .from_anndata()") - accessor = "AnnData" + otype = "AnnData" elif data_is_mudata(data): if not data_is_path: logger.warning("data is a MuData, please use .from_mudata()") - accessor = "MuData" + otype = "MuData" elif not data_is_path: # UPath is a subclass of Path raise TypeError("data has to be a string, Path, UPath") - return accessor + return otype def __init__(artifact: Artifact, *args, **kwargs): @@ -526,7 +522,7 @@ def __init__(artifact: Artifact, *args, **kwargs): raise ValueError("Only one non-keyword arg allowed: data") data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0] - type: str = kwargs.pop("type") if "type" in kwargs else None + kind: str = kwargs.pop("kind") if "kind" in kwargs else None key: str | None = kwargs.pop("key") if "key" in kwargs else None run: Run | None = kwargs.pop("run") if "run" in kwargs else None description: str | None = ( @@ -534,10 +530,8 @@ def __init__(artifact: Artifact, *args, **kwargs): ) revises: Artifact | None = kwargs.pop("revises") if "revises" in kwargs else None version: str | None = kwargs.pop("version") if "version" in kwargs else None - visibility: int | None = ( - kwargs.pop("visibility") - if "visibility" in kwargs - else VisibilityChoice.default.value + _branch_code: int | None = ( + kwargs.pop("_branch_code") if "_branch_code" in kwargs else 1 ) format = kwargs.pop("format") if "format" in kwargs else None _is_internal_call = kwargs.pop("_is_internal_call", False) @@ -554,14 +548,14 @@ def __init__(artifact: Artifact, *args, **kwargs): using_key = ( kwargs.pop("using_key") if "using_key" in kwargs else settings._using_key ) - accessor = kwargs.pop("_accessor") if "_accessor" in kwargs else None - accessor = _check_accessor_artifact(data=data, accessor=accessor) - if "is_new_version_of" in kwargs: - logger.warning("`is_new_version_of` will be removed soon, please use `revises`") - revises = kwargs.pop("is_new_version_of") + otype = kwargs.pop("otype") if "otype" in kwargs else None + otype = _check_otype_artifact(data=data, otype=otype) + if "type" in kwargs: + logger.warning("`type` will be removed soon, please use `kind`") + kind = kwargs.pop("type") if not len(kwargs) == 0: raise ValueError( - "Only data, key, run, description, version, 
revises, visibility" + "Only data, key, run, description, version, revises" f" can be passed, you passed: {kwargs}" ) if revises is not None and key is not None and revises.key != key: @@ -654,11 +648,11 @@ def __init__(artifact: Artifact, *args, **kwargs): if revises is not None: kwargs["key"] = revises.key - kwargs["type"] = type + kwargs["kind"] = kind kwargs["version"] = version kwargs["description"] = description - kwargs["visibility"] = visibility - kwargs["_accessor"] = accessor + kwargs["_branch_code"] = _branch_code + kwargs["otype"] = otype kwargs["revises"] = revises # this check needs to come down here because key might be populated from an # existing file path during get_artifact_kwargs_from_data() @@ -669,8 +663,6 @@ def __init__(artifact: Artifact, *args, **kwargs): ): raise ValueError("Pass one of key, run or description as a parameter") - add_transform_to_kwargs(kwargs, kwargs["run"]) - super(Artifact, artifact).__init__(**kwargs) @@ -692,8 +684,8 @@ def from_df( run=run, description=description, revises=revises, - _accessor="DataFrame", - type="dataset", + otype="DataFrame", + kind="dataset", **kwargs, ) return artifact @@ -719,8 +711,8 @@ def from_anndata( run=run, description=description, revises=revises, - _accessor="AnnData", - type="dataset", + otype="AnnData", + kind="dataset", **kwargs, ) return artifact @@ -744,8 +736,8 @@ def from_mudata( run=run, description=description, revises=revises, - _accessor="MuData", - type="dataset", + otype="MuData", + kind="dataset", **kwargs, ) return artifact @@ -885,7 +877,7 @@ def replace( ) else: old_storage = auto_storage_key_from_artifact(self) - is_dir = self.n_objects is not None + is_dir = self.n_files is not None new_storage = auto_storage_key_from_artifact_uid( self.uid, kwargs["suffix"], is_dir ) @@ -1036,15 +1028,17 @@ def delete( f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})." 
f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}" ) - # by default, we only move artifacts into the trash (visibility = -1) - trash_visibility = VisibilityChoice.trash.value - if self.visibility > trash_visibility and not permanent: + # by default, we only move artifacts into the trash (_branch_code = -1) + trash__branch_code = -1 + if self._branch_code > trash__branch_code and not permanent: if storage is not None: logger.warning("moving artifact to trash, storage arg is ignored") # move to trash - self.visibility = trash_visibility + self._branch_code = trash__branch_code self.save() - logger.important(f"moved artifact to trash (visibility = {trash_visibility})") + logger.important( + f"moved artifact to trash (_branch_code = {trash__branch_code})" + ) return # if the artifact is already in the trash @@ -1173,7 +1167,7 @@ def _cache_path(self) -> UPath: # docstring handled through attach_func_to_class_method def restore(self) -> None: - self.visibility = VisibilityChoice.default.value + self._branch_code = 1 self.save() diff --git a/lamindb/_collection.py b/lamindb/_collection.py index 73e277531..75a99e3f9 100644 --- a/lamindb/_collection.py +++ b/lamindb/_collection.py @@ -1,5 +1,6 @@ from __future__ import annotations +import warnings from collections import defaultdict from typing import ( TYPE_CHECKING, @@ -14,7 +15,6 @@ from lamindb_setup.core._docs import doc_args from lamindb_setup.core.hashing import hash_set -from lamindb.base.types import VisibilityChoice from lamindb.models import ( Collection, CollectionArtifact, @@ -26,7 +26,6 @@ from ._utils import attach_func_to_class_method from .core._data import ( _track_run_input, - add_transform_to_kwargs, describe, get_run, save_feature_set_links, @@ -96,7 +95,7 @@ def __init__( meta_artifact: Artifact | None = ( kwargs.pop("meta_artifact") if "meta_artifact" in kwargs else None ) - name: str | None = kwargs.pop("name") if "name" in kwargs else None + key: str | None = kwargs.pop("key") if "key" in kwargs else None description: str | None = ( kwargs.pop("description") if "description" in kwargs else None ) @@ -107,27 +106,29 @@ def __init__( run: Run | None = kwargs.pop("run") if "run" in kwargs else None revises: Collection | None = kwargs.pop("revises") if "revises" in kwargs else None version: str | None = kwargs.pop("version") if "version" in kwargs else None - visibility: int | None = ( - kwargs.pop("visibility") - if "visibility" in kwargs - else VisibilityChoice.default.value + _branch_code: int | None = ( + kwargs.pop("_branch_code") if "_branch_code" in kwargs else 1 ) - if "is_new_version_of" in kwargs: - logger.warning("`is_new_version_of` will be removed soon, please use `revises`") - revises = kwargs.pop("is_new_version_of") + if "name" in kwargs: + key = kwargs.pop("name") + warnings.warn( + f"argument `name` will be removed, please pass {key} to `key` instead", + FutureWarning, + stacklevel=2, + ) if not len(kwargs) == 0: raise ValueError( - f"Only artifacts, name, run, description, reference, reference_type, visibility can be passed, you passed: {kwargs}" + f"Only artifacts, key, run, description, reference, reference_type can be passed, you passed: {kwargs}" ) - provisional_uid, version, name, revises = process_revises( - revises, version, name, Collection + provisional_uid, version, key, description, revises = process_revises( + revises, version, key, description, Collection ) run = get_run(run) if isinstance(artifacts, Artifact): artifacts = 
[artifacts] else: if not hasattr(artifacts, "__getitem__"): - raise ValueError("Artifact or List[Artifact] is allowed.") + raise ValueError("Artifact or list[Artifact] is allowed.") assert isinstance(artifacts[0], Artifact) # type: ignore # noqa: S101 hash = from_artifacts(artifacts) # type: ignore if meta_artifact is not None: @@ -158,18 +159,16 @@ def __init__( ) # update the run of the collection with the latest run existing_collection.run = run - existing_collection.transform = run.transform init_self_from_db(collection, existing_collection) - update_attributes(collection, {"description": description, "name": name}) + update_attributes(collection, {"description": description, "key": key}) else: kwargs = {} - add_transform_to_kwargs(kwargs, run) search_names_setting = settings.creation.search_names - if revises is not None and name == revises.name: + if revises is not None and key == revises.key: settings.creation.search_names = False super(Collection, collection).__init__( uid=provisional_uid, - name=name, + key=key, description=description, reference=reference, reference_type=reference_type, @@ -177,7 +176,7 @@ def __init__( hash=hash, run=run, version=version, - visibility=visibility, + _branch_code=_branch_code, revises=revises, **kwargs, ) @@ -307,12 +306,14 @@ def load( # docstring handled through attach_func_to_class_method def delete(self, permanent: bool | None = None) -> None: - # change visibility to trash - trash_visibility = VisibilityChoice.trash.value - if self.visibility > trash_visibility and permanent is not True: - self.visibility = trash_visibility + # change _branch_code to trash + trash__branch_code = -1 + if self._branch_code > trash__branch_code and permanent is not True: + self._branch_code = trash__branch_code self.save() - logger.warning(f"moved collection to trash (visibility = {trash_visibility})") + logger.warning( + f"moved collection to trash (_branch_code = {trash__branch_code})" + ) return # permanent delete @@ -357,7 +358,7 @@ def save(self, using: str | None = None) -> Collection: # docstring handled through attach_func_to_class_method def restore(self) -> None: - self.visibility = VisibilityChoice.default.value + self._branch_code = 1 self.save() diff --git a/lamindb/_finish.py b/lamindb/_finish.py index 2198c22c8..7e6b1e13a 100644 --- a/lamindb/_finish.py +++ b/lamindb/_finish.py @@ -37,7 +37,7 @@ def save_run_logs(run: Run, save_run: bool = False) -> None: artifact = Artifact( logs_path, description=f"log streams of run {run.uid}", - visibility=0, + _branch_code=0, run=False, ) artifact.save(upload=True, print_progress=False) @@ -98,8 +98,8 @@ def notebook_to_report(notebook_path: Path, output_path: Path) -> None: def notebook_to_script( - transform: Transform, notebook_path: Path, script_path: Path -) -> None: + transform: Transform, notebook_path: Path, script_path: Path | None = None +) -> None | str: import jupytext notebook = jupytext.read(notebook_path) @@ -107,8 +107,11 @@ def notebook_to_script( # remove global metadata header py_content = re.sub(r"^# ---\n.*?# ---\n\n", "", py_content, flags=re.DOTALL) # replace title - py_content = py_content.replace(f"# # {transform.name}", "# # transform.name") - script_path.write_text(py_content) + py_content = py_content.replace(f"# # {transform.description}", "#") + if script_path is None: + return py_content + else: + script_path.write_text(py_content) # removes NotebookNotSaved error message from notebook html @@ -208,20 +211,12 @@ def save_context_core( 
ln.settings.creation.artifact_silence_missing_run_warning = True # track source code hash, _ = hash_file(source_code_path) # ignore hash_type for now - if ( - transform._source_code_artifact_id is not None - or transform.hash is not None # .hash is equivalent to .transform - ): + if transform.hash is not None: # check if the hash of the transform source code matches # (for scripts, we already run the same logic in track() - we can deduplicate the call at some point) - ref_hash = ( - transform.hash - if transform.hash is not None - else transform._source_code_artifact.hash - ) - if hash != ref_hash: + if hash != transform.hash: response = input( - f"You are about to overwrite existing source code (hash '{ref_hash}') for Transform('{transform.uid}')." + f"You are about to overwrite existing source code (hash '{transform.hash}') for Transform('{transform.uid}')." f" Proceed? (y/n)" ) if response == "y": @@ -246,13 +241,13 @@ def save_context_core( overwrite_env = False if overwrite_env: hash, _ = hash_file(env_path) - artifact = ln.Artifact.filter(hash=hash, visibility=0).one_or_none() + artifact = ln.Artifact.filter(hash=hash, _branch_code=0).one_or_none() new_env_artifact = artifact is None if new_env_artifact: artifact = ln.Artifact( env_path, description="requirements.txt", - visibility=0, + _branch_code=0, run=False, ) artifact.save(upload=True, print_progress=False) @@ -274,7 +269,7 @@ def save_context_core( if is_r_notebook: title_text, report_path = clean_r_notebook_html(report_path) if title_text is not None: - transform.name = title_text + transform.description = title_text if run.report_id is not None: hash, _ = hash_file(report_path) # ignore hash_type for now if hash != run.report.hash: @@ -292,7 +287,7 @@ def save_context_core( report_file = ln.Artifact( report_path, description=f"Report of run {run.uid}", - visibility=0, # hidden file + _branch_code=0, # hidden file run=False, ) report_file.save(upload=True, print_progress=False) diff --git a/lamindb/_parents.py b/lamindb/_parents.py index 9c5facfb0..51c76ac84 100644 --- a/lamindb/_parents.py +++ b/lamindb/_parents.py @@ -349,8 +349,8 @@ def _record_label(record: Record, field: str | None = None): rf' FACE="Monospace">uid={record.uid}
version={record.version}>' ) elif isinstance(record, Run): - if record.transform.name: - name = f'{record.transform.name.replace("&", "&")}' + if record.transform.description: + name = f'{record.transform.description.replace("&", "&")}' elif record.transform.key: name = f'{record.transform.key.replace("&", "&")}' else: name = @@ -402,22 +402,22 @@ def _get_all_parent_runs(data: Artifact | Collection) -> list: inputs_run = ( r.__getattribute__(f"input_{name}s") .all() - .filter(visibility__in=[0, 1]) + .filter(_branch_code__in=[0, 1]) .list() ) if name == "artifact": inputs_run += ( - r.input_collections.all().filter(visibility__in=[0, 1]).list() + r.input_collections.all().filter(_branch_code__in=[0, 1]).list() ) outputs_run = ( r.__getattribute__(f"output_{name}s") .all() - .filter(visibility__in=[0, 1]) + .filter(_branch_code__in=[0, 1]) .list() ) if name == "artifact": outputs_run += ( - r.output_collections.all().filter(visibility__in=[0, 1]).list() + r.output_collections.all().filter(_branch_code__in=[0, 1]).list() ) # if inputs are outputs artifacts are the same, will result infinite loop # so only show as outputs @@ -451,7 +451,7 @@ def _get_all_child_runs(data: Artifact | Collection) -> list: { f.run for f in data.run.output_collections.all() - .filter(visibility__in=[0, 1]) + .filter(_branch_code__in=[0, 1]) .all() } ) @@ -462,24 +462,24 @@ inputs_run = ( r.__getattribute__(f"input_{name}s") .all() - .filter(visibility__in=[0, 1]) + .filter(_branch_code__in=[0, 1]) .list() ) if name == "artifact": inputs_run += ( - r.input_collections.all().filter(visibility__in=[0, 1]).list() + r.input_collections.all().filter(_branch_code__in=[0, 1]).list() ) run_inputs_outputs += [(inputs_run, r)] outputs_run = ( r.__getattribute__(f"output_{name}s") .all() - .filter(visibility__in=[0, 1]) + .filter(_branch_code__in=[0, 1]) .list() ) if name == "artifact": outputs_run += ( - r.output_collections.all().filter(visibility__in=[0, 1]).list() + r.output_collections.all().filter(_branch_code__in=[0, 1]).list() ) run_inputs_outputs += [(r, outputs_run)] diff --git a/lamindb/_query_set.py b/lamindb/_query_set.py index 85968590c..05ca6ee6c 100644 --- a/lamindb/_query_set.py +++ b/lamindb/_query_set.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +import warnings from collections import UserList from collections.abc import Iterable from collections.abc import Iterable as IterableType @@ -22,7 +23,6 @@ Record, Run, Transform, - VisibilityChoice, ) from .core.exceptions import DoesNotExist @@ -79,6 +79,31 @@ def one_helper(self): return self[0] +def get_backward_compat_filter_kwargs(expressions): + name_mappings = { + "name": "key", # backward compat <1.0 + "visibility": "_branch_code", # for convenience (and backward compat <1.0) + "transform": "run__transform", # for convenience (and backward compat <1.0) + } + mapped = {} + for field, value in expressions.items(): + parts = field.split("__") + if parts[0] in name_mappings: + if parts[0] not in {"transform", "visibility"}: + warnings.warn( + f"{parts[0]} is deprecated, please query for {name_mappings[parts[0]]} instead", + DeprecationWarning, + stacklevel=2, + ) + new_field = name_mappings[parts[0]] + ( + "__" + "__".join(parts[1:]) if len(parts) > 1 else "" + ) + mapped[new_field] = value + else: + mapped[field] = value + return mapped + + def process_expressions(queryset: QuerySet, expressions: dict) -> dict: def _map_databases(value: Any, key: str, target_db: str) -> tuple[str, 
Any]: if isinstance(value, Record): @@ -105,23 +130,24 @@ def _map_databases(value: Any, key: str, target_db: str) -> tuple[str, Any]: return key, value - if queryset.model in {Artifact, Collection}: - # visibility is set to 0 unless expressions contains id or uid equality + if queryset.model in {Collection, Transform, Artifact}: + expressions = get_backward_compat_filter_kwargs(expressions) + + if issubclass(queryset.model, Record): + # _branch_code is set to 0 unless expressions contains id or uid if not ( "id" in expressions or "uid" in expressions or "uid__startswith" in expressions ): - visibility = "visibility" - if not any(e.startswith(visibility) for e in expressions): - expressions[visibility] = ( - VisibilityChoice.default.value - ) # default visibility - # if visibility is None, do not apply a filter + _branch_code = "_branch_code" + if not any(e.startswith(_branch_code) for e in expressions): + expressions[_branch_code] = 1 # default _branch_code + # if _branch_code is None, do not apply a filter # otherwise, it would mean filtering for NULL values, which doesn't make # sense for a non-NULLABLE column - elif visibility in expressions and expressions[visibility] is None: - expressions.pop(visibility) + elif _branch_code in expressions and expressions[_branch_code] is None: + expressions.pop(_branch_code) if queryset._db is not None: # only check for database mismatch if there is a defined database on the # queryset @@ -213,6 +239,8 @@ def get_basic_field_names( "created_at", "created_by_id", "updated_at", + "aux", + "_branch_code", ]: if field_name in field_names: field_names.remove(field_name) diff --git a/lamindb/_record.py b/lamindb/_record.py index abd85eb36..95abfcf85 100644 --- a/lamindb/_record.py +++ b/lamindb/_record.py @@ -42,6 +42,7 @@ from lamindb.base.validation import FieldValidationError from lamindb.models import ( Artifact, + BasicRecord, CanCurate, Collection, Feature, @@ -199,7 +200,7 @@ def __init__(record: Record, *args, **kwargs): ) init_self_from_db(record, existing_record) return None - super(Record, record).__init__(**kwargs) + super(BasicRecord, record).__init__(**kwargs) if isinstance(record, ValidateFields): # this will trigger validation against django validators try: @@ -214,7 +215,7 @@ def __init__(record: Record, *args, **kwargs): raise ValueError("please provide keyword arguments, not plain arguments") else: # object is loaded from DB (**kwargs could be omitted below, I believe) - super(Record, record).__init__(*args, **kwargs) + super(BasicRecord, record).__init__(*args, **kwargs) _store_record_old_name(record) @@ -568,6 +569,7 @@ def using( "storage": "root", "feature": "name", "ulabel": "name", + "space": "name", # TODO: this should be updated with the currently used space instead during transfer } @@ -603,7 +605,6 @@ def update_fk_to_default_db( FKBULK = [ "organism", "source", - "_source_code_artifact", # Transform "report", # Run ] @@ -635,18 +636,20 @@ def get_transfer_run(record) -> Run: uid=uid, name=f"Transfer from `{slug}`", key=key, type="function" ).save() settings.creation.search_names = search_names - # use the global run context to get the parent run id + # use the global run context to get the initiated_by_run run id if context.run is not None: - parent = context.run + initiated_by_run = context.run else: if not settings.creation.artifact_silence_missing_run_warning: logger.warning(WARNING_RUN_TRANSFORM) - parent = None + initiated_by_run = None # it doesn't seem to make sense to create new runs for every transfer - run = 
Run.filter(transform=transform, parent=parent).one_or_none() + run = Run.filter( + transform=transform, initiated_by_run=initiated_by_run + ).one_or_none() if run is None: - run = Run(transform=transform, parent=parent).save() - run.parent = parent # so that it's available in memory + run = Run(transform=transform, initiated_by_run=initiated_by_run).save() + run.initiated_by_run = initiated_by_run # so that it's available in memory return run @@ -737,13 +740,13 @@ def save(self, *args, **kwargs) -> Record: revises._revises = None # ensure we don't start a recursion revises.save() check_name_change(self) - super(Record, self).save(*args, **kwargs) + super(BasicRecord, self).save(*args, **kwargs) _store_record_old_name(self) self._revises = None # save unversioned record else: check_name_change(self) - super(Record, self).save(*args, **kwargs) + super(BasicRecord, self).save(*args, **kwargs) _store_record_old_name(self) # perform transfer of many-to-many fields # only supported for Artifact and Collection records @@ -863,10 +866,10 @@ def delete(self) -> None: new_latest.is_latest = True with transaction.atomic(): new_latest.save() - super(Record, self).delete() + super(BasicRecord, self).delete() logger.warning(f"new latest version is {new_latest}") return None - super(Record, self).delete() + super(BasicRecord, self).delete() METHOD_NAMES = [ @@ -891,4 +894,5 @@ def delete(self) -> None: } for name in METHOD_NAMES: + attach_func_to_class_method(name, BasicRecord, globals()) attach_func_to_class_method(name, Record, globals()) diff --git a/lamindb/_run.py b/lamindb/_run.py index 7ab030cad..4f6573abc 100644 --- a/lamindb/_run.py +++ b/lamindb/_run.py @@ -18,7 +18,7 @@ def __init__(run: Run, *args, **kwargs): reference_type: str | None = ( kwargs.pop("reference_type") if "reference_type" in kwargs else None ) - parent: Run | None = kwargs.pop("parent", None) + initiated_by_run: Run | None = kwargs.pop("initiated_by_run", None) if transform is None: raise TypeError("Pass transform parameter") if transform._state.adding: @@ -27,7 +27,7 @@ def __init__(run: Run, *args, **kwargs): super(Run, run).__init__( transform=transform, reference=reference, - parent=parent, + initiated_by_run=initiated_by_run, reference_type=reference_type, ) diff --git a/lamindb/_save.py b/lamindb/_save.py index 622e0a533..fd7fb4d76 100644 --- a/lamindb/_save.py +++ b/lamindb/_save.py @@ -63,7 +63,7 @@ def save(records: Iterable[Record], ignore_conflicts: bool | None = False) -> No Update a single existing record: >>> transform = ln.Transform.get("0Cb86EZj") - >>> transform.name = "New name" + >>> transform.description = "New description" >>> transform.save() """ diff --git a/lamindb/_transform.py b/lamindb/_transform.py index 8c9ce142e..2bdaca8a3 100644 --- a/lamindb/_transform.py +++ b/lamindb/_transform.py @@ -1,5 +1,6 @@ from __future__ import annotations +import warnings from typing import TYPE_CHECKING from lamin_utils import logger @@ -20,8 +21,10 @@ def __init__(transform: Transform, *args, **kwargs): if len(args) == len(transform._meta.concrete_fields): super(Transform, transform).__init__(*args, **kwargs) return None - name: str | None = kwargs.pop("name") if "name" in kwargs else None key: str | None = kwargs.pop("key") if "key" in kwargs else None + description: str | None = ( + kwargs.pop("description") if "description" in kwargs else None + ) revises: Transform | None = kwargs.pop("revises") if "revises" in kwargs else None version: str | None = kwargs.pop("version") if "version" in kwargs else None type: 
TransformType | None = kwargs.pop("type") if "type" in kwargs else "pipeline" @@ -29,14 +32,27 @@ reference_type: str | None = ( kwargs.pop("reference_type") if "reference_type" in kwargs else None ) - if "is_new_version_of" in kwargs: - logger.warning("`is_new_version_of` will be removed soon, please use `revises`") - revises = kwargs.pop("is_new_version_of") + if "name" in kwargs: + if key is None: + key = kwargs.pop("name") + warnings.warn( + f"`name` will be removed soon, please pass '{key}' to `key` instead", + FutureWarning, + stacklevel=2, + ) + else: + # description did not exist before, so no check is necessary + description = kwargs.pop("name") + warnings.warn( + f"`name` will be removed soon, please pass '{description}' to `description` instead", + FutureWarning, + stacklevel=2, + ) # below is internal use that we'll hopefully be able to eliminate uid: str | None = kwargs.pop("uid") if "uid" in kwargs else None if not len(kwargs) == 0: raise ValueError( - "Only name, key, version, type, revises, reference, " + "Only key, description, version, type, revises, reference, " f"reference_type can be passed, but you passed: {kwargs}" ) if revises is None: @@ -48,28 +64,37 @@ .first() ) elif key is not None: - revises = ( + candidate_for_revises = ( Transform.filter(key=key, is_latest=True) .order_by("-created_at") .first() ) + if candidate_for_revises is not None: + revises = candidate_for_revises + if candidate_for_revises.source_code is None: + # no source code was saved yet, so return the same transform + uid = revises.uid if revises is not None and uid is not None and uid == revises.uid: from ._record import init_self_from_db, update_attributes + if revises.key != key: + logger.warning("ignoring inconsistent key") init_self_from_db(transform, revises) - update_attributes(transform, {"name": name}) + update_attributes(transform, {"description": description}) return None if revises is not None and key is not None and revises.key != key: note = message_update_key_in_version_family( suid=revises.stem_uid, existing_key=revises.key, new_key=key, - registry="Artifact", + registry="Transform", ) raise InconsistentKey( - f"`key` is {key}, but `revises.key` is '{revises.key}'\n\nEither do *not* pass `key`.\n\n{note}" + f"`key` is '{key}', but `revises.key` is '{revises.key}'\n\nEither do *not* pass `key`.\n\n{note}" ) - new_uid, version, name, revises = process_revises(revises, version, name, Transform) + new_uid, version, key, description, revises = process_revises( + revises, version, key, description, Transform + ) # this is only because the user-facing constructor allows passing a uid # most others don't if uid is None: @@ -79,7 +104,7 @@ has_consciously_provided_uid = True super(Transform, transform).__init__( uid=uid, - name=name, + description=description, key=key, type=type, version=version, @@ -91,13 +116,6 @@ def delete(self) -> None: - _source_code_artifact = None - if self._source_code_artifact is not None: - _source_code_artifact = self._source_code_artifact - self._source_code_artifact = None - self.save() - if _source_code_artifact is not None: - _source_code_artifact.delete(permanent=True) # query all runs and delete their artifacts runs = Run.filter(transform=self) for run in runs: @@ -117,10 +135,10 @@ def latest_run(self) -> Run: def view_lineage(self, with_successors: bool = 
False, distance: int = 5): return _view_parents( record=self, - field="name", + field="key", with_children=with_successors, distance=distance, attr_name="predecessors", ) diff --git a/lamindb/base/__init__.py b/lamindb/base/__init__.py index e69de29bb..e0f82caf3 100644 --- a/lamindb/base/__init__.py +++ b/lamindb/base/__init__.py @@ -0,0 +1,14 @@ +"""Base library. + +Is available also when no instance is connected. + +Modules: + +.. autosummary:: + :toctree: . + + types + +""" + +from . import types diff --git a/lamindb/base/types.py b/lamindb/base/types.py index e47bdc562..adc5e74ca 100644 --- a/lamindb/base/types.py +++ b/lamindb/base/types.py @@ -1,11 +1,33 @@ +"""Types. + +Central object types. + +.. autosummary:: + :toctree: . + + ArtifactKind + TransformType + FeatureDtype + +Basic types. + +.. autosummary:: + :toctree: . + + UPathStr + StrField + ListLike + FieldAttr +""" + from __future__ import annotations from typing import Literal, Union import numpy as np import pandas as pd -from django.db.models import IntegerChoices # needed elsewhere from django.db.models.query_utils import DeferredAttribute as FieldAttr +from lamindb_setup.core.types import UPathStr # noqa: F401 # need to use Union because __future__.annotations doesn't do the job here <3.10 # typing.TypeAlias, >3.10 on but already deprecated @@ -13,7 +35,7 @@ StrField = Union[str, FieldAttr] # typing.TypeAlias TransformType = Literal["pipeline", "notebook", "upload", "script", "function", "glue"] -ArtifactType = Literal["dataset", "model"] +ArtifactKind = Literal["dataset", "model"] FeatureDtype = Literal[ "cat", # categorical variables "num", # numerical variables @@ -25,9 +47,3 @@ "datetime", # datetime variables "object", # this is a pandas type, we're only using it for complicated types, not for strings ] - - -class VisibilityChoice(IntegerChoices): - default = 1 - hidden = 0 - trash = -1 diff --git a/lamindb/core/__init__.py b/lamindb/core/__init__.py index 4d5bd6424..06dc63afb 100644 --- a/lamindb/core/__init__.py +++ b/lamindb/core/__init__.py @@ -6,6 +6,7 @@ :toctree: . 
Record + BasicRecord Registry QuerySet QueryManager @@ -59,7 +60,6 @@ loaders datasets storage - types exceptions subsettings logger @@ -82,6 +82,7 @@ SOMACurator, ) from lamindb.models import ( + BasicRecord, CanCurate, FeatureValue, HasParents, diff --git a/lamindb/core/_context.py b/lamindb/core/_context.py index a8f6a6367..e2ef83897 100644 --- a/lamindb/core/_context.py +++ b/lamindb/core/_context.py @@ -66,20 +66,21 @@ def get_notebook_path() -> Path: # from https://stackoverflow.com/questions/61901628 -def get_notebook_name_colab() -> str: +def get_notebook_key_colab() -> str: from socket import gethostbyname, gethostname # type: ignore from requests import get # type: ignore ip = gethostbyname(gethostname()) # 172.28.0.12 try: - name = get(f"http://{ip}:9000/api/sessions").json()[0]["name"] # noqa: S113 + key = get(f"http://{ip}:9000/api/sessions").json()[0]["name"] # noqa: S113 + key = f"colab/{key}" except Exception: logger.warning( - "could not get notebook name from Google Colab, using: notebook.ipynb" + "could not get notebook key from Google Colab, using: colab/notebook.ipynb" ) - name = "notebook.ipynb" - return name.rstrip(".ipynb") + key = "colab/notebook.ipynb" + return key def pretty_pypackages(dependencies: dict) -> str: @@ -191,7 +192,7 @@ class Context: def __init__(self): self._uid: str | None = None - self._name: str | None = None + self._description: str | None = None self._version: str | None = None self._transform: Transform | None = None self._run: Run | None = None @@ -207,22 +208,31 @@ def transform(self) -> Transform | None: return self._transform @property - def uid(self) -> str | None: - """`uid` argument for `context.transform`.""" - return self._uid + def description(self) -> str | None: + """`description` argument for `context.transform`.""" + return self._description - @uid.setter - def uid(self, value: str | None): - self._uid = value + @description.setter + def description(self, value: str | None): + self._description = value @property def name(self) -> str | None: - """`name argument for `context.transform`.""" - return self._name + """Deprecated. 
Populates `description` argument for `context.transform`.""" return self._description @name.setter def name(self, value: str | None): - self._name = value + self._description = value + + @property + def uid(self) -> str | None: + """`uid` argument for `context.transform`.""" + return self._uid + + @uid.setter + def uid(self, value: str | None): + self._uid = value @property def version(self) -> str | None: @@ -282,24 +292,23 @@ def track( transform = None self._path = None if transform is None: + description = None if is_run_from_ipython: - self._path, name = self._track_notebook(path_str=path) + self._path, description = self._track_notebook(path_str=path) transform_type = "notebook" transform_ref = None transform_ref_type = None else: ( self._path, - name, transform_type, transform_ref, transform_ref_type, ) = self._track_source_code(path=path) - # overwrite the parsed name - if self.name is not None: - name = self.name + if description is None: + description = self._description self._create_or_load_transform( - name=name, + description=description, transform_ref=transform_ref, transform_ref_type=transform_ref_type, transform_type=transform_type, @@ -369,7 +378,7 @@ def _track_source_code( self, *, path: UPathStr | None, - ) -> tuple[Path, str, str, str, str]: + ) -> tuple[Path, str, str, str]: # for `.py` files, classified as "script" # for `.Rmd` and `.qmd` files, which we classify # as "notebook" because they typically come with an .html run report @@ -387,31 +396,30 @@ else: path = Path(path) transform_type = "notebook" if path.suffix in {".Rmd", ".qmd"} else "script" - name = path.name reference = None reference_type = None if settings.sync_git_repo is not None: reference = get_transform_reference_from_git_repo(path) reference_type = "url" - return path, name, transform_type, reference, reference_type + return path, transform_type, reference, reference_type def _track_notebook( self, *, path_str: str | None, - ) -> tuple[Path, str]: + ) -> tuple[Path, str | None]: if path_str is None: path = get_notebook_path() else: path = Path(path_str) - name = path.stem + description = None path_str = path.as_posix() if path_str.endswith("Untitled.ipynb"): raise RuntimeError("Please rename your notebook before tracking it") if path_str.startswith("/fileId="): logger.warning("tracking on Google Colab is experimental") - name = get_notebook_name_colab() - path_str = f"{name}.ipynb" + path_str = get_notebook_key_colab() + path = Path(path_str) else: import nbproject @@ -421,7 +429,7 @@ # notebook is not saved pass if nbproject_title is not None: - name = nbproject_title + description = nbproject_title # log imported python packages try: from nbproject.dev._pypackage import infer_pypackages @@ -434,12 +442,12 @@ except Exception: logger.debug("inferring imported packages failed") pass - return path, name + return path, description def _create_or_load_transform( self, *, - name: str, + description: str, transform_ref: str | None = None, transform_ref_type: str | None = None, transform_type: TransformType = None, @@ -500,7 +508,7 @@ class SlashCount(Func): for transform in transforms ] ) message = f"ignoring transform{plural_s} with same filename:\n{transforms_str}" if message != "": logger.important(message) self.uid, transform = uid, target_transform @@ -549,7 +557,7 @@ class SlashCount(Func): transform = Transform( uid=self.uid, 
version=self.version, - name=name, + description=description, key=key, reference=transform_ref, reference_type=transform_ref_type, @@ -569,25 +577,22 @@ else: uid = transform.uid # transform was already saved via `finish()` - transform_was_saved = ( - transform._source_code_artifact_id is not None - or transform.source_code is not None - ) + transform_was_saved = transform.source_code is not None # check whether the transform.key is consistent if transform.key != key: raise UpdateContext(get_key_clashing_message(transform, key)) - elif transform.name != name: - transform.name = name + elif transform.description != description: + transform.description = description transform.save() self._logging_message_track += ( - "updated transform name, " # white space on purpose + "updated transform description, " # white space on purpose ) elif ( transform.created_by_id != ln_setup.settings.user.id and not transform_was_saved ): raise UpdateContext( f'{transform.created_by.name} ({transform.created_by.handle}) already works on this draft {transform.type}.\n\nPlease create a revision via `ln.track("{uid[:-4]}{increment_base62(uid[-4:])}")` or a new transform with a *different* filename and `ln.track("{ids.base62_12()}0000")`.' ) # check whether transform source code was already saved if transform_was_saved: bump_revision = False if is_run_from_ipython: bump_revision = True else: hash, _ = hash_file(self._path) # ignore hash_type for now - if transform.hash is not None: - condition = hash != transform.hash - else: - condition = hash != transform._source_code_artifact.hash - if condition: + if hash != transform.hash: bump_revision = True else: self._logging_message_track += ( @@ -666,8 +667,10 @@ def finish(self, ignore_non_consecutive: None | bool = None) -> None: import nbproject # it might be that the user modifies the title just before ln.finish() - if (nbproject_title := nbproject.meta.live.title) != self.transform.name: - self.transform.name = nbproject_title + if ( + nbproject_title := nbproject.meta.live.title + ) != self.transform.description: + self.transform.description = nbproject_title self.transform.save() if get_seconds_since_modified(self._path) > 2 and not ln_setup._TESTING: raise NotebookNotSaved(get_save_notebook_message()) diff --git a/lamindb/core/_data.py b/lamindb/core/_data.py index cc7938f01..1f4fc271a 100644 --- a/lamindb/core/_data.py +++ b/lamindb/core/_data.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections import defaultdict -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from django.db import connections from lamin_utils import colors, logger @@ -56,11 +56,6 @@ def get_run(run: Run | None) -> Run | None: return run -def add_transform_to_kwargs(kwargs: dict[str, Any], run: Run): - if run is not None: - kwargs["transform"] = run.transform - - def save_feature_sets(self: Artifact | Collection) -> None: if hasattr(self, "_feature_sets"): from lamindb.core._feature_manager import get_feature_set_by_slot_ diff --git a/lamindb/core/_describe.py b/lamindb/core/_describe.py index 11e6f5cd1..9e3a07b68 100644 --- a/lamindb/core/_describe.py +++ b/lamindb/core/_describe.py @@ -60,14 +60,14 @@ def describe_header(self: 
Artifact | Collection | Run) -> Tree: logger.warning( f"This is not the latest version of the {self.__class__.__name__}." ) - if hasattr(self, "visibility"): - if self.visibility == 0: + if hasattr(self, "_branch_code"): + if self._branch_code == 0: logger.warning("This artifact is hidden.") - elif self.visibility == -1: + elif self._branch_code == -1: logger.warning("This artifact is the trash.") # initialize tree suffix = self.suffix if hasattr(self, "suffix") and self.suffix else "" - accessor = self._accessor if hasattr(self, "_accessor") and self._accessor else "" + accessor = self.otype if hasattr(self, "otype") and self.otype else "" suffix_accessor = ( f"{suffix}/{accessor}" if suffix and accessor else suffix or accessor or "" ) @@ -89,15 +89,13 @@ def describe_general(self: Artifact | Collection, tree: Tree | None = None) -> T general = tree.add(Text("General", style="bold bright_cyan")) general.add(f".uid = '{self.uid}'") if hasattr(self, "key") and self.key: - general.add( - f".key = '{self.key}'" if self._key_is_virtual else f".key = {self.key}" - ) + general.add(f".key = '{self.key}'") if hasattr(self, "size") and self.size: general.add(f".size = {self.size}") if hasattr(self, "hash") and self.hash: general.add(f".hash = '{self.hash}'") - if hasattr(self, "n_objects") and self.n_objects: - general.add(f".n_objects = {self.n_objects}") + if hasattr(self, "n_files") and self.n_files: + general.add(f".n_files = {self.n_files}") if hasattr(self, "n_observations") and self.n_observations: general.add(Text(f".n_observations = {self.n_observations}")) if hasattr(self, "version") and self.version: @@ -131,7 +129,7 @@ def describe_general(self: Artifact | Collection, tree: Tree | None = None) -> T if hasattr(self, "transform") and self.transform: general.add( Text( - f".transform = '{self.transform.name}'", + f".transform = '{self.transform.description}'", style="cyan3", ) ) diff --git a/lamindb/core/_feature_manager.py b/lamindb/core/_feature_manager.py index 39a63da73..b5bc8b65a 100644 --- a/lamindb/core/_feature_manager.py +++ b/lamindb/core/_feature_manager.py @@ -832,11 +832,11 @@ def _add_values( model_name = "Param" if is_param else "Feature" if is_param: if self._host.__class__ == Artifact: - if self._host.type != "model": + if self._host.kind != "model": raise ValidationError("Can only set params for model-like artifacts.") else: if self._host.__class__ == Artifact: - if self._host.type != "dataset" and self._host.type is not None: + if self._host.kind != "dataset" and self._host.kind is not None: raise ValidationError( "Can only set features for dataset-like artifacts." 
) @@ -1089,10 +1089,10 @@ def _add_set_from_df( ): """Add feature set corresponding to column names of DataFrame.""" if isinstance(self._host, Artifact): - assert self._host._accessor == "DataFrame" # noqa: S101 + assert self._host.otype == "DataFrame" # noqa: S101 else: # Collection - assert self._host.artifact._accessor == "DataFrame" # noqa: S101 + assert self._host.artifact.otype == "DataFrame" # noqa: S101 df = self._host.load() feature_set = FeatureSet.from_df( df=df, @@ -1113,7 +1113,7 @@ def _add_set_from_anndata( ): """Add features from AnnData.""" if isinstance(self._host, Artifact): - assert self._host._accessor == "AnnData" # noqa: S101 + assert self._host.otype == "AnnData" # noqa: S101 else: raise NotImplementedError() @@ -1143,7 +1143,7 @@ def _add_set_from_mudata( if obs_fields is None: obs_fields = {} if isinstance(self._host, Artifact): - assert self._host._accessor == "MuData" # noqa: S101 + assert self._host.otype == "MuData" # noqa: S101 else: raise NotImplementedError() diff --git a/lamindb/core/storage/_tiledbsoma.py b/lamindb/core/storage/_tiledbsoma.py index c97c4e363..e44565306 100644 --- a/lamindb/core/storage/_tiledbsoma.py +++ b/lamindb/core/storage/_tiledbsoma.py @@ -226,6 +226,6 @@ def save_tiledbsoma_experiment( _is_internal_call=True, ) artifact.n_observations = n_observations - artifact._accessor = "tiledbsoma" + artifact.otype = "tiledbsoma" return artifact.save() diff --git a/lamindb/core/storage/paths.py b/lamindb/core/storage/paths.py index 3c0b2b1d8..c2e43cf51 100644 --- a/lamindb/core/storage/paths.py +++ b/lamindb/core/storage/paths.py @@ -26,7 +26,7 @@ # add type annotations back asap when re-organizing the module def auto_storage_key_from_artifact(artifact: Artifact): if artifact.key is None or artifact._key_is_virtual: - is_dir = artifact.n_objects is not None + is_dir = artifact.n_files is not None return auto_storage_key_from_artifact_uid(artifact.uid, artifact.suffix, is_dir) else: return artifact.key diff --git a/lamindb/core/types.py b/lamindb/core/types.py index 560660d7f..236a5fc71 100644 --- a/lamindb/core/types.py +++ b/lamindb/core/types.py @@ -1,28 +1,6 @@ -"""Types. - -Central object types. - -.. autosummary:: - :toctree: . - - ArtifactType - TransformType - FeatureDtype - -Basic types. - -.. autosummary:: - :toctree: . 
- - UPathStr - StrField - ListLike -""" - from lamindb_setup.core.types import UPathStr from lamindb.base.types import ( - ArtifactType, FeatureDtype, FieldAttr, ListLike, diff --git a/lamindb/core/versioning.py b/lamindb/core/versioning.py index 0741c9cfe..1f98864b4 100644 --- a/lamindb/core/versioning.py +++ b/lamindb/core/versioning.py @@ -133,15 +133,18 @@ def get_new_path_from_uid(old_path: UPath, old_uid: str, new_uid: str): def process_revises( revises: IsVersioned | None, version: str | None, - name: str | None, + key: str | None, + description: str | None, type: type[IsVersioned], -) -> tuple[str, str, str, IsVersioned | None]: +) -> tuple[str, str, str, str, IsVersioned | None]: if revises is not None and not isinstance(revises, type): raise TypeError(f"`revises` has to be of type `{type.__name__}`") uid, revises = create_uid( revises=revises, version=version, n_full_id=type._len_full_uid ) if revises is not None: - if name is None: - name = revises.name - return uid, version, name, revises + if description is None: + description = revises.description + if key is None: + key = revises.key + return uid, version, key, description, revises diff --git a/lamindb/curators/__init__.py b/lamindb/curators/__init__.py index 8a278cbf5..2edb9dbf3 100644 --- a/lamindb/curators/__init__.py +++ b/lamindb/curators/__init__.py @@ -1539,7 +1539,7 @@ def save_artifact( run=run, ) artifact.n_observations = self._n_obs - artifact._accessor = "tiledbsoma" + artifact.otype = "tiledbsoma" artifact.save() else: artifact = self._artifact @@ -2125,13 +2125,13 @@ def save_artifact( organism, ) - if artifact._accessor == "DataFrame": + if artifact.otype == "DataFrame": artifact.features._add_set_from_df(field=columns_field, **feature_kwargs) - elif artifact._accessor == "AnnData": + elif artifact.otype == "AnnData": artifact.features._add_set_from_anndata( var_field=columns_field, **feature_kwargs ) - elif artifact._accessor == "MuData": + elif artifact.otype == "MuData": artifact.features._add_set_from_mudata( var_fields=columns_field, **feature_kwargs ) @@ -2170,7 +2170,7 @@ def _add_labels( from_curator=True, ) - if artifact._accessor == "MuData": + if artifact.otype == "MuData": for modality, modality_fields in fields.items(): column_field_modality = columns_field.get(modality) if modality == "obs": diff --git a/lamindb/curators/_spatial.py b/lamindb/curators/_spatial.py index 284c5fc45..c734643af 100644 --- a/lamindb/curators/_spatial.py +++ b/lamindb/curators/_spatial.py @@ -388,7 +388,7 @@ def save_artifact( ) # According to Tim it is not easy to calculate the number of observations. 
# We would have to write custom code to iterate over labels (which might not even exist at that point) - self._artifact._accessor = "spatialdata" + self._artifact.otype = "spatialdata" self._artifact.save() # Link featuresets @@ -407,7 +407,7 @@ def _add_set_from_spatialdata( """Add FeatureSets from SpatialData.""" if obs_fields is None: obs_fields = {} - assert host._accessor == "spatialdata" # noqa: S101 + assert host.otype == "spatialdata" # noqa: S101 feature_sets = {} diff --git a/lamindb/integrations/_vitessce.py b/lamindb/integrations/_vitessce.py index 24ad4e65c..2c94d8138 100644 --- a/lamindb/integrations/_vitessce.py +++ b/lamindb/integrations/_vitessce.py @@ -60,7 +60,7 @@ def save_vitessce_config( # the below will be replaced with a `ln.tracked()` decorator soon transform = Transform( uid="kup03MJBsIVa0002", - name="save_vitessce_config", + key="save_vitessce_config", type="function", version="3", ).save() diff --git a/lamindb/migrations/0070_lamindbv1_migrate_data.py b/lamindb/migrations/0070_lamindbv1_migrate_data.py new file mode 100644 index 000000000..9c34c7370 --- /dev/null +++ b/lamindb/migrations/0070_lamindbv1_migrate_data.py @@ -0,0 +1,78 @@ +# Generated by Django 5.2 on 2025-01-05 11:58 + +from pathlib import Path + +import lamindb_setup as ln_setup +import psycopg2 +from django.db import migrations + + +def get_artifact_path_psycopg2(artifact_id): + """Get artifact path using psycopg2.""" + query = """ + SELECT + s.root || '/.lamindb/' || a.uid || a.suffix AS full_path + FROM + lamindb_artifact a + JOIN lamindb_storage s ON a.storage_id = s.id + WHERE + a.id = %s + """ + + with psycopg2.connect(ln_setup.settings.instance.db) as conn: + with conn.cursor() as cur: + cur.execute(query, (artifact_id,)) + return cur.fetchone()[0] + + +def transfer_source_code(apps, schema_editor): + from lamindb._finish import notebook_to_script + + Transform = apps.get_model("lamindb", "Transform") + transforms = Transform.objects.filter( + _source_code_artifact__isnull=False, + ).select_related("_source_code_artifact") + + for transform in transforms: + print(f"migrating source code of transform {transform}") + artifact = transform._source_code_artifact + print("artifact", artifact.uid) + + path_str = get_artifact_path_psycopg2(artifact.id) + print(ln_setup.settings.storage.root_as_str) + print(path_str) + if path_str.startswith(ln_setup.settings.storage.root_as_str): + path = ( + ln_setup.settings.storage.root + / f".lamindb/{artifact.uid}{artifact.suffix}" + ) + else: + path = ln_setup.core.upath.UPath(path_str) + if path.exists(): + if path_str.startswith("s3://"): + local_path = Path(f"temp{path.suffix}") + path.download_to(local_path) + else: + local_path = path + + if artifact.suffix == ".ipynb": + transform.source_code = notebook_to_script(transform, local_path) + else: + transform.source_code = local_path.read_text() + transform.hash = artifact.hash + path.unlink() + else: + print(f"path did not exist: {path_str}") + transform._source_code_artifact = None + transform.save() + artifact.delete() + + +class Migration(migrations.Migration): + dependencies = [ + ("lamindb", "0069_squashed"), + ] + + operations = [ + migrations.RunPython(transfer_source_code), + ] diff --git a/lamindb/migrations/0071_lamindbv1_migrate_schema.py b/lamindb/migrations/0071_lamindbv1_migrate_schema.py new file mode 100644 index 000000000..3ede0f717 --- /dev/null +++ b/lamindb/migrations/0071_lamindbv1_migrate_schema.py @@ -0,0 +1,741 @@ +# Generated by Django 5.2 on 2025-01-05 11:58 + +import 
django.db.models.deletion +import django.utils.timezone +from django.db import migrations, models +from lamindb_setup.core.hashing import hash_dict + +import lamindb.base.fields + + +def create_default_space(apps, schema_editor): + Space = apps.get_model("lamindb", "Space") + Space.objects.get_or_create( + name="All", + description="Every team & user with access to the instance has access.", + ) + + +def populate_hashes(apps, schema_editor): + ParamValue = apps.get_model("lamindb", "ParamValue") + + # Process all existing records + for param_value in ParamValue.objects.all(): + value = param_value.value + # Check if value is a dict or list (complex JSON) + if isinstance(value, dict): + value_hash = hash_dict(value) + param_value.hash = value_hash + param_value.save() + + +class Migration(migrations.Migration): + dependencies = [ + ("lamindb", "0070_lamindbv1_migrate_data"), + ] + + operations = [ + # create Space model + migrations.CreateModel( + name="Space", + fields=[ + ("id", models.SmallAutoField(primary_key=True, serialize=False)), + ("name", models.CharField(db_index=True, max_length=100)), + ( + "description", + lamindb.base.fields.CharField( + blank=True, default=None, max_length=255, null=True + ), + ), + ( + "created_at", + lamindb.base.fields.DateTimeField( + auto_now_add=True, + db_index=True, + default=django.utils.timezone.now, + ), + ), + ( + "created_by", + lamindb.base.fields.ForeignKey( + blank=True, + default=None, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="+", + to="lamindb.user", + ), + ), + ], + options={ + "abstract": False, + }, + ), + # populate the default space + migrations.RunPython(create_default_space), + # add space field to all models + migrations.AddField( + model_name="artifact", + name="space", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=1, + on_delete=django.db.models.deletion.PROTECT, + to="lamindb.space", + ), + ), + migrations.AddField( + model_name="collection", + name="space", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=1, + on_delete=django.db.models.deletion.PROTECT, + to="lamindb.space", + ), + ), + migrations.AddField( + model_name="feature", + name="space", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=1, + on_delete=django.db.models.deletion.PROTECT, + to="lamindb.space", + ), + ), + migrations.AddField( + model_name="featureset", + name="space", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=1, + on_delete=django.db.models.deletion.PROTECT, + to="lamindb.space", + ), + ), + migrations.AddField( + model_name="featurevalue", + name="space", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=1, + on_delete=django.db.models.deletion.PROTECT, + to="lamindb.space", + ), + ), + migrations.AddField( + model_name="param", + name="space", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=1, + on_delete=django.db.models.deletion.PROTECT, + to="lamindb.space", + ), + ), + migrations.AddField( + model_name="paramvalue", + name="space", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=1, + on_delete=django.db.models.deletion.PROTECT, + to="lamindb.space", + ), + ), + migrations.AddField( + model_name="run", + name="space", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=1, + on_delete=django.db.models.deletion.PROTECT, + to="lamindb.space", + ), + ), + migrations.AddField( + model_name="storage", + name="space", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=1, +
on_delete=django.db.models.deletion.PROTECT, + to="lamindb.space", + ), + ), + migrations.AddField( + model_name="transform", + name="space", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=1, + on_delete=django.db.models.deletion.PROTECT, + to="lamindb.space", + ), + ), + migrations.AddField( + model_name="ulabel", + name="space", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=1, + on_delete=django.db.models.deletion.PROTECT, + to="lamindb.space", + ), + ), + migrations.AddField( + model_name="user", + name="space", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=1, + on_delete=django.db.models.deletion.PROTECT, + to="lamindb.space", + ), + ), + # changes to transform + migrations.RemoveField( + model_name="transform", + name="_source_code_artifact", + ), + # prepare removal of legacy description field + migrations.RunSQL( + sql=""" + UPDATE lamindb_transform + SET name = name || ' ' || description + WHERE description IS NOT NULL + AND description != ''; + """ + ), + migrations.RemoveField( + model_name="transform", + name="description", + ), + migrations.RenameField( + model_name="transform", + old_name="name", + new_name="description", + ), + migrations.AlterField( + model_name="transform", + name="key", + field=lamindb.base.fields.CharField( + blank=True, db_index=True, default=None, max_length=255, null=True + ), + ), + migrations.AlterField( + model_name="transform", + name="description", + field=lamindb.base.fields.CharField( + blank=True, db_index=True, default=None, max_length=255, null=True + ), + ), + migrations.RunSQL( + sql=""" + UPDATE lamindb_transform + SET key = description + WHERE key IS NULL and description IS NOT NULL; + """ + ), + # collection: name -> key + migrations.RenameField( + model_name="collection", + old_name="name", + new_name="key", + ), + migrations.AlterField( + model_name="collection", + name="key", + field=lamindb.base.fields.CharField( + blank=True, db_index=True, default=None, max_length=255 + ), + ), + # visibility -> _branch_code + migrations.RenameField( + model_name="artifact", + old_name="visibility", + new_name="_branch_code", + ), + migrations.RenameField( + model_name="collection", + old_name="visibility", + new_name="_branch_code", + ), + migrations.AlterField( + model_name="artifact", + name="_branch_code", + field=models.SmallIntegerField(db_index=True, default=1, db_default=1), + ), + migrations.AlterField( + model_name="collection", + name="_branch_code", + field=models.SmallIntegerField(db_index=True, default=1, db_default=1), + ), + migrations.AddField( + model_name="feature", + name="_branch_code", + field=models.SmallIntegerField(db_index=True, default=1, db_default=1), + ), + migrations.AddField( + model_name="featureset", + name="_branch_code", + field=models.SmallIntegerField(db_index=True, default=1, db_default=1), + ), + migrations.AddField( + model_name="param", + name="_branch_code", + field=models.SmallIntegerField(db_index=True, default=1, db_default=1), + ), + migrations.AddField( + model_name="run", + name="_branch_code", + field=models.SmallIntegerField(db_index=True, default=1, db_default=1), + ), + migrations.AddField( + model_name="storage", + name="_branch_code", + field=models.SmallIntegerField(db_index=True, default=1, db_default=1), + ), + migrations.AddField( + model_name="transform", + name="_branch_code", + field=models.SmallIntegerField(db_index=True, default=1, db_default=1), + ), + migrations.AddField( + model_name="ulabel", + name="_branch_code", +
field=models.SmallIntegerField(db_index=True, default=1, db_default=1), + ), + migrations.AddField( + model_name="user", + name="_branch_code", + field=models.SmallIntegerField(db_index=True, default=1, db_default=1), + ), + # fix dtype values + migrations.RunSQL( + sql=""" + UPDATE lamindb_feature + SET dtype = 'num' + WHERE dtype = 'number' + """ + ), + migrations.RunSQL( + sql=""" + UPDATE lamindb_featureset + SET dtype = 'num' + WHERE dtype = 'number' + """ + ), + # an aux field on Record + migrations.AddField( + model_name="artifact", + name="aux", + field=models.JSONField(default=None, db_default=None, null=True), + ), + migrations.AddField( + model_name="collection", + name="aux", + field=models.JSONField(default=None, db_default=None, null=True), + ), + migrations.AddField( + model_name="feature", + name="aux", + field=models.JSONField(default=None, db_default=None, null=True), + ), + migrations.AddField( + model_name="featureset", + name="aux", + field=models.JSONField(default=None, db_default=None, null=True), + ), + migrations.AddField( + model_name="param", + name="aux", + field=models.JSONField(default=None, db_default=None, null=True), + ), + migrations.AddField( + model_name="run", + name="aux", + field=models.JSONField(default=None, db_default=None, null=True), + ), + migrations.AddField( + model_name="storage", + name="aux", + field=models.JSONField(default=None, db_default=None, null=True), + ), + migrations.AddField( + model_name="transform", + name="aux", + field=models.JSONField(default=None, db_default=None, null=True), + ), + migrations.AddField( + model_name="ulabel", + name="aux", + field=models.JSONField(default=None, db_default=None, null=True), + ), + migrations.AddField( + model_name="user", + name="aux", + field=models.JSONField(default=None, db_default=None, null=True), + ), + migrations.RenameField( + model_name="run", + old_name="is_consecutive", + new_name="_is_consecutive", + ), + migrations.AddField( + model_name="run", + name="_status_code", + field=models.SmallIntegerField(db_index=True, default=0), + ), + migrations.AddField( + model_name="transform", + name="_template", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=None, + null=True, + on_delete=django.db.models.deletion.PROTECT, + related_name="_derived_from", + to="lamindb.transform", + ), + ), + migrations.RenameField( + model_name="artifact", + old_name="type", + new_name="kind", + ), + migrations.RenameField( + model_name="artifact", + old_name="_accessor", + new_name="otype", + ), + migrations.AddField( + model_name="run", + name="_logfile", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=None, + null=True, + on_delete=django.db.models.deletion.PROTECT, + related_name="_logfile_of", + to="lamindb.artifact", + ), + ), + # unique constraint on hash + migrations.AddField( + model_name="featurevalue", + name="hash", + field=lamindb.base.fields.CharField( + blank=True, db_index=True, default=None, max_length=22, null=True + ), + ), + migrations.AddField( + model_name="paramvalue", + name="hash", + field=lamindb.base.fields.CharField( + blank=True, db_index=True, default=None, max_length=22, null=True + ), + ), + migrations.RunPython(populate_hashes), + migrations.AddConstraint( + model_name="featurevalue", + constraint=models.UniqueConstraint( + condition=models.Q(("hash__isnull", True)), + fields=("feature", "value"), + name="unique_simple_feature_value", + ), + ), + migrations.AddConstraint( + model_name="featurevalue", + constraint=models.UniqueConstraint( + 
condition=models.Q(("hash__isnull", False)), + fields=("feature", "hash"), + name="unique_complex_feature_value", + ), + ), + migrations.AddConstraint( + model_name="paramvalue", + constraint=models.UniqueConstraint( + condition=models.Q(("hash__isnull", True)), + fields=("param", "value"), + name="unique_simple_param_value", + ), + ), + migrations.AddConstraint( + model_name="paramvalue", + constraint=models.UniqueConstraint( + condition=models.Q(("hash__isnull", False)), + fields=("param", "hash"), + name="unique_complex_param_value", + ), + ), + # add _curator field + migrations.AddField( + model_name="artifact", + name="_curator", + field=models.JSONField(default=None, db_default=None, null=True), + ), + # add _expect_many fields + migrations.AddField( + model_name="feature", + name="_expect_many", + field=models.BooleanField(default=True, db_default=True), + ), + migrations.AddField( + model_name="param", + name="_expect_many", + field=models.BooleanField(default=False, db_default=False), + ), + # remove transform field + migrations.RemoveField( + model_name="artifact", + name="transform", + ), + migrations.RemoveField( + model_name="collection", + name="transform", + ), + # richer link tables + migrations.CreateModel( + name="TransformULabel", + fields=[ + ( + "created_at", + lamindb.base.fields.DateTimeField(auto_now_add=True, db_index=True), + ), + ("id", models.BigAutoField(primary_key=True, serialize=False)), + ( + "created_by", + lamindb.base.fields.ForeignKey( + blank=True, + default=lamindb.base.users.current_user_id, + on_delete=django.db.models.deletion.PROTECT, + related_name="+", + to="lamindb.user", + ), + ), + ( + "run", + lamindb.base.fields.ForeignKey( + blank=True, + default=lamindb.models.current_run, + null=True, + on_delete=django.db.models.deletion.PROTECT, + related_name="+", + to="lamindb.run", + ), + ), + ( + "transform", + lamindb.base.fields.ForeignKey( + blank=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="links_ulabel", + to="lamindb.transform", + ), + ), + ( + "ulabel", + lamindb.base.fields.ForeignKey( + blank=True, + on_delete=django.db.models.deletion.PROTECT, + related_name="links_transform", + to="lamindb.ulabel", + ), + ), + ], + options={ + "unique_together": {("transform", "ulabel")}, + }, + bases=(lamindb.models.LinkORM, models.Model), + ), + migrations.RunSQL( + sql=""" + INSERT INTO lamindb_transformulabel (transform_id, ulabel_id, created_at, created_by_id) + SELECT + transform_id, + ulabel_id, + CURRENT_TIMESTAMP, -- Sets current timestamp for created_at + 1 -- Sets default user ID 1 for created_by_id + FROM lamindb_transform_ulabels; + """ + ), + migrations.RemoveField( + model_name="transform", + name="ulabels", + ), + migrations.AddField( + model_name="transform", + name="ulabels", + field=models.ManyToManyField( + to="lamindb.ulabel", + through="lamindb.TransformULabel", + related_name="transforms", + ), + ), + migrations.AddField( + model_name="artifactparamvalue", + name="created_at", + field=lamindb.base.fields.DateTimeField( + auto_now_add=True, db_index=True, default=django.utils.timezone.now + ), + preserve_default=False, + ), + migrations.AddField( + model_name="artifactparamvalue", + name="created_by", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=lamindb.base.users.current_user_id, + on_delete=django.db.models.deletion.PROTECT, + related_name="+", + to="lamindb.user", + ), + ), + migrations.AddField( + model_name="artifactparamvalue", + name="run", + field=lamindb.base.fields.ForeignKey( + 
blank=True, + default=lamindb.models.current_run, + null=True, + on_delete=django.db.models.deletion.PROTECT, + related_name="+", + to="lamindb.run", + ), + ), + migrations.AddField( + model_name="runparamvalue", + name="created_at", + field=lamindb.base.fields.DateTimeField( + auto_now_add=True, db_index=True, default=django.utils.timezone.now + ), + preserve_default=False, + ), + migrations.AddField( + model_name="runparamvalue", + name="created_by", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=lamindb.base.users.current_user_id, + on_delete=django.db.models.deletion.PROTECT, + related_name="+", + to="lamindb.user", + ), + ), + migrations.AddField( + model_name="ulabel", + name="is_concept", + field=lamindb.base.fields.BooleanField( + blank=True, + default=False, + db_default=False, + ), + ), + migrations.RenameField( + model_name="run", + old_name="parent", + new_name="initiated_by_run", + ), + migrations.AlterField( + model_name="run", + name="initiated_by_run", + field=lamindb.base.fields.ForeignKey( + blank=True, + default=None, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="initiated_runs", + to="lamindb.run", + ), + ), + # create and set _overwrite_versions + migrations.AddField( + model_name="artifact", + name="_overwrite_versions", + field=lamindb.base.fields.BooleanField(blank=True, default=None, null=True), + ), + migrations.RunSQL( + sql=""" + UPDATE lamindb_artifact + SET _overwrite_versions = CASE + WHEN n_objects IS NOT NULL THEN TRUE + ELSE FALSE + END; + """ + ), + # rename n_objects to n_files + migrations.RenameField( + model_name="artifact", + old_name="n_objects", + new_name="n_files", + ), + # let feature value and paramvalue inherit from Record + migrations.AddField( + model_name="featurevalue", + name="_branch_code", + field=models.SmallIntegerField(db_default=1, db_index=True, default=1), + ), + migrations.AddField( + model_name="featurevalue", + name="aux", + field=models.JSONField(db_default=None, default=None, null=True), + ), + migrations.AddField( + model_name="paramvalue", + name="_branch_code", + field=models.SmallIntegerField(db_default=1, db_index=True, default=1), + ), + migrations.AddField( + model_name="paramvalue", + name="aux", + field=models.JSONField(db_default=None, default=None, null=True), + ), + migrations.AddField( + model_name="run", + name="name", + field=lamindb.base.fields.CharField( + blank=True, default=None, max_length=150, null=True + ), + ), + migrations.AlterField( + model_name="space", + name="created_at", + field=lamindb.base.fields.DateTimeField(auto_now_add=True, db_index=True), + ), + migrations.AddField( + model_name="param", + name="type", + field=lamindb.base.fields.CharField( + blank=True, db_index=True, default=None, max_length=100, null=True + ), + ), + migrations.AddField( + model_name="feature", + name="type", + field=lamindb.base.fields.CharField( + blank=True, db_index=True, default=None, max_length=100, null=True + ), + ), + migrations.RemoveField( + model_name="feature", + name="_previous_runs", + ), + migrations.RemoveField( + model_name="param", + name="_previous_runs", + ), + migrations.RemoveField( + model_name="storage", + name="_previous_runs", + ), + migrations.RemoveField( + model_name="ulabel", + name="_previous_runs", + ), + migrations.AlterField( + model_name="artifact", + name="_overwrite_versions", + field=lamindb.base.fields.BooleanField(blank=True, default=None), + ), + ] diff --git a/lamindb/models.py index 3e4e0c4da..7da7c876e 100644
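The `populate_hashes` step together with the `unique_simple_*` / `unique_complex_*` constraints in this migration implement one dedup rule in two paths: simple JSON values enter the unique constraint directly, while dict-like values are represented by a content hash (a direct constraint on large dictionaries leads to hashing errors, per the comment in `models.py` below). A minimal sketch of the rule, assuming only `hash_dict` as imported in the migration; the values are illustrative:

```python
# a sketch of the dedup rule, not the shipped implementation
from lamindb_setup.core.hashing import hash_dict


def dedup_key(value):
    # simple scalars are compared directly (constraint condition: hash IS NULL)
    if isinstance(value, (int, float, str, bool)):
        return ("value", value)
    # for dicts/lists, a content hash stands in for the value
    # (constraint condition: hash IS NOT NULL)
    return ("hash", hash_dict(value))


assert dedup_key(0.01) == ("value", 0.01)
assert dedup_key({"lr": 0.01}) == dedup_key({"lr": 0.01})  # equal dicts, equal hash
```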
--- a/lamindb/models.py +++ b/lamindb/models.py @@ -14,8 +14,8 @@ overload, ) -from django.db import models -from django.db.models import CASCADE, PROTECT, Field +from django.db import IntegrityError, models +from django.db.models import CASCADE, PROTECT, Field, Q from django.db.models.base import ModelBase from django.db.models.fields.related import ( ManyToManyField, @@ -25,7 +25,7 @@ from lamin_utils import colors from lamindb_setup import _check_instance_setup from lamindb_setup.core._docs import doc_args -from lamindb_setup.core.hashing import HASH_LENGTH +from lamindb_setup.core.hashing import HASH_LENGTH, hash_dict from lamindb.base.fields import ( BigIntegerField, @@ -40,13 +40,12 @@ from .base.ids import base62_8, base62_12, base62_20 from .base.types import ( - ArtifactType, + ArtifactKind, FeatureDtype, FieldAttr, ListLike, StrField, TransformType, - VisibilityChoice, ) from .base.users import current_user_id @@ -199,10 +198,6 @@ class Meta: updated_at: datetime = DateTimeField(auto_now=True, db_index=True) """Time of last update to record.""" - # no default related_name below because it'd clash with the reverse accessor - # of the .run field - _previous_runs: Run = models.ManyToManyField("lamindb.Run", related_name="+") - """Sequence of runs that created or updated the record.""" @overload def __init__(self): ... @@ -786,9 +781,38 @@ def __get_name_with_schema__(cls) -> str: return f"{schema_prefix}{cls.__name__}" +class BasicRecord(models.Model, metaclass=Registry): + """Basic metadata record. + + It has the same methods as Record, but doesn't have the additional fields. + + It's mainly used for LinkORMs and similar. + """ + + class Meta: + abstract = True + + +class Space(BasicRecord): + """Spaces.""" + + id: int = models.SmallAutoField(primary_key=True) + """Internal id, valid only in one DB instance.""" + name: str = models.CharField(max_length=100, db_index=True) + """Name of space.""" + description: str | None = CharField(null=True) + """Description of space.""" + created_at: datetime = DateTimeField(auto_now_add=True, db_index=True) + """Time of creation of record.""" + created_by: User = ForeignKey( + "User", CASCADE, default=None, related_name="+", null=True + ) + """Creator of record.""" + + @doc_args(RECORD_REGISTRY_EXAMPLE) -class Record(models.Model, metaclass=Registry): - """Base class for metadata records. +class Record(BasicRecord, metaclass=Registry): + """Metadata record. Every `Record` is a data model that comes with a registry in form of a SQL table in your database. @@ -805,6 +829,28 @@ class Record(models.Model, metaclass=Registry): machine learning or biological models. """ + _branch_code: int = models.SmallIntegerField(db_index=True, default=1, db_default=1) + """Whether record is on a branch, in archive or in trash. + + This dictates whether a record appears in queries & searches. + + Coding is as follows: + + - 3: template (hidden in queries & searches) + - 2: draft (hidden in queries & searches) + - 1: default (visible in queries & searches) + - 0: archive (hidden, meant to be kept) + - -1: trash (hidden, scheduled for deletion) + + Any integer >3 codes a branch that's involved in a pull request. + """ + space: Space = ForeignKey(Space, PROTECT, default=1) + """The space in which the record lives.""" + aux: dict[str, Any] | None = models.JSONField( + default=None, db_default=None, null=True + ) + """Auxiliary field for dictionary-like metadata.""" + def save(self, *args, **kwargs) -> Record: """Save.
@@ -1051,6 +1097,8 @@ def path(self) -> Path | UPath: pass +# does not inherit from TracksRun because the Transform +# is needed to define a run class Transform(Record, IsVersioned): """Data transformations. @@ -1081,7 +1129,7 @@ class Transform(Record, IsVersioned): Args: name: `str` A name or title. key: `str | None = None` A short name or path-like semantic key. - type: `TransformType | None = "pipeline"` See :class:`~lamindb.core.types.TransformType`. + type: `TransformType | None = "pipeline"` See :class:`~lamindb.base.types.TransformType`. revises: `Transform | None = None` An old version of the transform. See Also: @@ -1117,32 +1165,25 @@ class Meta(Record.Meta, IsVersioned.Meta): _len_stem_uid: int = 12 _len_full_uid: int = 16 - _name_field: str = "name" + _name_field: str = "key" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" uid: str = CharField(unique=True, db_index=True, max_length=_len_full_uid) """Universal id.""" - name: str | None = CharField(max_length=150, db_index=True, null=True) - """A name or title. For instance, a pipeline name, notebook title, etc.""" - key: str | None = CharField(max_length=120, db_index=True, null=True) - """A key for concise reference & versioning (optional).""" - description: str | None = CharField(max_length=255, null=True) - """A description (optional).""" + key: str | None = CharField(db_index=True, null=True) + """A name or "/"-separated path-like string. + + All transforms with the same key are part of the same version family. + """ + description: str | None = CharField(db_index=True, null=True) + """A description.""" type: TransformType = CharField( max_length=20, db_index=True, default="pipeline", ) - """:class:`~lamindb.core.types.TransformType` (default `"pipeline"`).""" - _source_code_artifact: Artifact | None = ForeignKey( - "Artifact", PROTECT, null=True, related_name="_source_code_of", default=None - ) - """Source code of the transform if stored as artifact within LaminDB. - - .. versionchanged:: 0.75 - Made private and deprecated for future removal. - """ + """:class:`~lamindb.base.types.TransformType` (default `"pipeline"`).""" source_code: str | None = TextField(null=True) """Source code of the transform. @@ -1150,12 +1191,16 @@ class Meta(Record.Meta, IsVersioned.Meta): The `source_code` field is no longer an artifact, but a text field. """ hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True) + """Hash of the source code.""" reference: str | None = CharField(max_length=255, db_index=True, null=True) - """Reference for the transform, e.g.. URL.""" + """Reference for the transform, e.g., a URL.""" reference_type: str | None = CharField(max_length=25, db_index=True, null=True) + """Reference type of the transform, e.g., 'url'.""" runs: Run """Runs of this transform.""" - ulabels: ULabel = models.ManyToManyField("ULabel", related_name="transforms") + ulabels: ULabel = models.ManyToManyField( + "ULabel", through="TransformULabel", related_name="transforms" + ) """ULabel annotations of this transform.""" predecessors: Transform = models.ManyToManyField( "self", symmetrical=False, related_name="successors" @@ -1165,8 +1210,10 @@ class Meta(Record.Meta, IsVersioned.Meta): These are auto-populated whenever an artifact or collection serves as a run input, e.g., `artifact.run` and `artifact.transform` get populated & saved. - The table provides a convenience method to query for the predecessors that - bypassed querying the :class:`~lamindb.Run`. 
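The `Transform` changes above drop the stored `name` in favor of the path-like `key`, with `name` re-derived via a property (defined just below). A hedged usage sketch under the new fields, mirroring the key-based call in the `_vitessce.py` change above; the key, type, and description values are illustrative:

```python
# minimal sketch, assuming the post-migration Transform constructor
import lamindb as ln

transform = ln.Transform(
    key="pipelines/preprocess",  # path-like; replaces the former `name`
    type="pipeline",
    description="filter and normalize raw counts",
).save()

# transforms sharing a key form one version family; `name` is now a
# derived property: the last "/"-segment of `key`
assert transform.name == "preprocess"
```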
+ The table provides a more convenient method to query for the predecessors that + bypasses querying the :class:`~lamindb.Run`. + + It also allows manually adding predecessors whose outputs are not tracked in a run. """ successors: Transform """Subsequent transforms. @@ -1191,6 +1238,10 @@ class Meta(Record.Meta, IsVersioned.Meta): User, PROTECT, default=current_user_id, related_name="created_transforms" ) """Creator of record.""" + _template: Transform | None = ForeignKey( + "Transform", PROTECT, related_name="_derived_from", default=None, null=True + ) + """Creating template.""" @overload def __init__( @@ -1214,6 +1265,14 @@ def __init__( ): super().__init__(*args, **kwargs) + @property + def name(self) -> str: + """Name of the transform. + + Splits `key` on `/` and returns the last element. + """ + return self.key.split("/")[-1] + @property def latest_run(self) -> Run: """The latest run of this transform.""" @@ -1239,14 +1298,29 @@ class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta): For categorical types, can define from which registry values are sampled, e.g., `cat[ULabel]` or `cat[bionty.CellType]`. """ + type = CharField(max_length=100, null=True, blank=True, db_index=True) + """Param type - a free form type (e.g., 'pipeline', 'model_training', 'post_processing').""" + _expect_many: bool = models.BooleanField(default=False, db_default=False) + """Indicates whether values for this param are expected to occur once or multiple times for an artifact/run (default `False`). + - if it's `False` (default), the values are artifact/run-level values and a dtype of `datetime` means `datetime` + - if it's `True`, the values come from an aggregation; this seems like an edge case, but it can be relevant when characterizing a model ensemble trained with different parameters + """ # backward fields values: ParamValue """Values for this parameter.""" +# ParamValue behaves in many ways like a link in a LinkORM +# in particular, we don't want a _public field on it +# Also, we don't inherit from TracksRun because a ParamValue +# is typically created before a run is created and we want to +# avoid delete cycles (for Model params though it might be helpful) class ParamValue(Record): - """Parameters with values akin to FeatureValue.""" + """Parameter values. + + Is largely analogous to `FeatureValue`.
+ """ # we do not have a unique constraint on param & value because it leads to hashing errors # for large dictionaries: https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0000 @@ -1273,6 +1347,40 @@ class ParamValue(Record): User, PROTECT, default=current_user_id, related_name="+" ) """Creator of record.""" + hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True) + + class Meta: + constraints = [ + # For simple types, use direct value comparison + models.UniqueConstraint( + fields=["param", "value"], + name="unique_simple_param_value", + condition=Q(hash__isnull=True), + ), + # For complex types (dictionaries), use hash + models.UniqueConstraint( + fields=["param", "hash"], + name="unique_complex_param_value", + condition=Q(hash__isnull=False), + ), + ] + + @classmethod + def get_or_create(cls, param, value): + # Simple types: int, float, str, bool + if isinstance(value, (int, float, str, bool)): + try: + return cls.objects.create(param=param, value=value, hash=None), False + except IntegrityError: + return cls.objects.get(param=param, value=value), True + + # Complex types: dict, list + else: + hash = hash_dict(value) + try: + return cls.objects.create(param=param, value=value, hash=hash), False + except IntegrityError: + return cls.objects.get(param=param, hash=hash), True class Run(Record): @@ -1330,6 +1438,8 @@ class Run(Record): """Internal id, valid only in one DB instance.""" uid: str = CharField(unique=True, db_index=True, max_length=20, default=base62_20) """Universal id, valid across DB instances.""" + name: str | None = CharField(max_length=150, null=True) + """A name.""" transform = ForeignKey(Transform, CASCADE, related_name="runs") """The transform :class:`~lamindb.Transform` that is being run.""" started_at: datetime = DateTimeField(auto_now_add=True, db_index=True) @@ -1342,6 +1452,10 @@ class Run(Record): "Artifact", PROTECT, null=True, related_name="_report_of", default=None ) """Report of run, e.g.. n html file.""" + _logfile: Artifact | None = ForeignKey( + "Artifact", PROTECT, null=True, related_name="_logfile_of", default=None + ) + """Report of run, e.g.. n html file.""" environment: Artifact | None = ForeignKey( "Artifact", PROTECT, null=True, related_name="_environment_of", default=None ) @@ -1363,8 +1477,6 @@ class Run(Record): """The collections serving as input for this run.""" output_collections: Collection """The collections generated by this run.""" - is_consecutive: bool | None = BooleanField(null=True) - """Indicates whether code was consecutively executed. Is relevant for notebooks.""" _param_values: ParamValue = models.ManyToManyField( ParamValue, through="RunParamValue", related_name="runs" ) @@ -1379,8 +1491,8 @@ class Run(Record): User, CASCADE, default=current_user_id, related_name="created_runs" ) """Creator of run.""" - parent: Run | None = ForeignKey( - "Run", CASCADE, null=True, related_name="children", default=None + initiated_by_run: Run | None = ForeignKey( + "Run", CASCADE, null=True, related_name="initiated_runs", default=None ) """The run that triggered the current run. @@ -1392,6 +1504,17 @@ class Run(Record): """ children: Run """The runs that are triggered by this run.""" + _is_consecutive: bool | None = BooleanField(null=True) + """Indicates whether code was consecutively executed. Is relevant for notebooks.""" + _status_code: int = models.SmallIntegerField(default=0, db_index=True) + """Status code of the run. 
+ + - 0: scheduled + - 1: started + - 2: errored + - 3: aborted + - 4: completed + """ @overload def __init__( @@ -1484,6 +1607,8 @@ class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta): """A universal random id, valid across DB instances.""" name: str = CharField(max_length=150, db_index=True, unique=True) """Name or title of ulabel (`unique=True`).""" + is_concept: bool = BooleanField(default=False, db_default=False) + """Distinguishes mere ontological parents from labels that are meant to be used for labeling; for instance, you would never want to label an artifact with a ulabel `Project`, only with actual project values such as `Project 1`, `Project 2`, etc.""" description: str | None = TextField(null=True) """A description (optional).""" reference: str | None = CharField(max_length=255, db_index=True, null=True) @@ -1545,7 +1670,7 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates): Args: name: `str` Name of the feature, typically. column name. - dtype: `FeatureDtype | Registry | list[Registry]` See :class:`~lamindb.core.types.FeatureDtype`. + dtype: `FeatureDtype | Registry | list[Registry]` See :class:`~lamindb.base.types.FeatureDtype`. For categorical types, can define from which registry values are sampled, e.g., `ULabel` or `[ULabel, bionty.CellType]`. unit: `str | None = None` Unit of measure, ideally SI (`"m"`, `"s"`, `"kg"`, etc.) or `"normalized"` etc. @@ -1623,12 +1748,14 @@ class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta): name: str = CharField(max_length=150, db_index=True, unique=True) """Name of feature (`unique=True`).""" dtype: FeatureDtype = CharField(max_length=64, db_index=True) - """Data type (:class:`~lamindb.core.types.FeatureDtype`). + """Data type (:class:`~lamindb.base.types.FeatureDtype`). For categorical types, can define from which registry values are sampled, e.g., `'cat[ULabel]'` or `'cat[bionty.CellType]'`. Unions are also allowed if the feature samples from two registries, e.g., `'cat[ULabel|bionty.CellType]'` """ + type = CharField(max_length=100, null=True, blank=True, db_index=True) + """Feature type - a free form type (e.g., 'readout', 'metric', 'metadata', 'expert_annotation', 'model_prediction').""" unit: str | None = CharField(max_length=30, db_index=True, null=True) """Unit of measure, ideally SI (`m`, `s`, `kg`, etc.) or 'normalized' etc. (optional).""" description: str | None = TextField(db_index=True, null=True) @@ -1642,7 +1769,12 @@ class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta): "FeatureSet", through="FeatureSetFeature", related_name="features" ) """Feature sets linked to this feature.""" + _expect_many: bool = models.BooleanField(default=True, db_default=True) + """Indicates whether values for this feature are expected to occur once or multiple times for an artifact (default `True`).
+ - if it's `True` (default), the values come from an observation-level aggregation and a dtype of `datetime` on the observation-level means `set[datetime]` on the artifact-level + - if it's `False`, it's an artifact-level value and `datetime` means `datetime`; this is an edge case, because an arbitrary artifact would always be a set of arbitrary measurements that would need to be aggregated ("one just happens to measure a single cell line in that artifact") + """ # backward fields values: FeatureValue """Values for this feature.""" @@ -1697,9 +1829,6 @@ class FeatureValue(Record, TracksRun): # there does not seem an issue with querying for a dict-like value # https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0001 - class Meta(Record.Meta, TracksRun.Meta): - abstract = False - _name_field: str = "value" feature: Feature | None = ForeignKey( @@ -1708,6 +1837,45 @@ class Meta(Record.Meta, TracksRun.Meta): """The dimension metadata.""" value: Any = models.JSONField() """The JSON-like value.""" + hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True) + """Value hash.""" + + class Meta(BasicRecord.Meta, TracksRun.Meta): + constraints = [ + # For simple types, use direct value comparison + models.UniqueConstraint( + fields=["feature", "value"], + name="unique_simple_feature_value", + condition=Q(hash__isnull=True), + ), + # For complex types (dictionaries), use hash + models.UniqueConstraint( + fields=["feature", "hash"], + name="unique_complex_feature_value", + condition=Q(hash__isnull=False), + ), + ] + + @classmethod + def get_or_create(cls, feature, value): + # Simple types: int, float, str, bool + if isinstance(value, (int, float, str, bool)): + try: + return cls.objects.create( + feature=feature, value=value, hash=None + ), False + except IntegrityError: + return cls.objects.get(feature=feature, value=value), True + + # Complex types: dict, list + else: + hash = hash_dict(value) + try: + return cls.objects.create( + feature=feature, value=value, hash=hash + ), False + except IntegrityError: + return cls.objects.get(feature=feature, hash=hash), True class FeatureSet(Record, CanCurate, TracksRun): @@ -1791,7 +1959,7 @@ class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta): uid: str = CharField(unique=True, db_index=True, max_length=20) """A universal id (hash of the set of feature values).""" name: str | None = CharField(max_length=150, null=True) - """A name (optional).""" + """A name.""" n = IntegerField() """Number of features in the set.""" dtype: str | None = CharField(max_length=64, null=True) @@ -1800,7 +1968,7 @@ class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta): For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level. """ registry: str = CharField(max_length=120, db_index=True) - """The registry that stores the feature identifiers, e.g., `'core.Feature'` or `'bionty.Gene'`. + """The registry that stores the feature identifiers, e.g., `'Feature'` or `'bionty.Gene'`. Depending on the registry, `.members` stores, e.g. `Feature` or `Gene` records. """ @@ -1996,7 +2164,6 @@ class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta): _len_full_uid: int = 20 _len_stem_uid: int = 16 - _name_field: str = "description" params: ParamManager = ParamManagerArtifact # type: ignore """Param manager. @@ -2095,12 +2262,14 @@ def labels(self) -> LabelManager: This is either a file suffix (`".csv"`, `".h5ad"`, etc.) or the empty string "".
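A hedged usage sketch of the `get_or_create` classmethod defined above; the feature name and values are illustrative. Note that the second tuple element flags whether an equal value already existed, which is the inverse of Django's `(obj, created)` convention:

```python
# sketch assuming a Feature named "training_config" has been saved before
from lamindb.models import Feature, FeatureValue

feature = Feature.get(name="training_config")  # hypothetical feature
config = {"lr": 0.01, "epochs": 10}

value, existed = FeatureValue.get_or_create(feature, config)    # created -> False
value2, existed2 = FeatureValue.get_or_create(feature, config)  # fetched -> True
assert value2.id == value.id and existed2 and not existed
```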
""" - type: ArtifactType | None = CharField( + kind: ArtifactKind | None = CharField( max_length=20, db_index=True, null=True, ) - """:class:`~lamindb.core.types.ArtifactType` (default `None`).""" + """:class:`~lamindb.base.types.ArtifactKind` (default `None`).""" + otype: str | None = CharField(max_length=64, db_index=True, null=True) + """Default Python object type, e.g., DataFrame, AnnData.""" size: int | None = BigIntegerField(null=True, db_index=True, default=None) """Size in bytes. @@ -2111,10 +2280,10 @@ def labels(self) -> LabelManager: Useful to ascertain integrity and avoid duplication. """ - n_objects: int | None = BigIntegerField(null=True, db_index=True, default=None) - """Number of objects. + n_files: int | None = BigIntegerField(null=True, db_index=True, default=None) + """Number of files for folder-like artifacts, `None` for file-like artifacts. - Typically, this denotes the number of files in an artifact. + Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`. """ n_observations: int | None = BigIntegerField(null=True, db_index=True, default=None) """Number of observations. @@ -2123,16 +2292,10 @@ def labels(self) -> LabelManager: """ _hash_type: str | None = CharField(max_length=30, db_index=True, null=True) """Type of hash.""" - _accessor: str | None = CharField(max_length=64, db_index=True, null=True) - """Default backed or memory accessor, e.g., DataFrame, AnnData.""" ulabels: ULabel = models.ManyToManyField( ULabel, through="ArtifactULabel", related_name="artifacts" ) """The ulabels measured in the artifact (:class:`~lamindb.ULabel`).""" - transform: Transform | None = ForeignKey( - Transform, PROTECT, related_name="output_artifacts", null=True, default=None - ) - """Transform whose run created the artifact.""" run: Run | None = ForeignKey( Run, PROTECT, related_name="output_artifacts", null=True, default=None ) @@ -2159,10 +2322,6 @@ def labels(self) -> LabelManager: ParamValue, through="ArtifactParamValue", related_name="artifacts" ) """Parameter values.""" - visibility: int = models.SmallIntegerField( - db_index=True, choices=VisibilityChoice.choices, default=1 - ) - """Visibility of artifact record in queries & searches (1 default, 0 hidden, -1 trash).""" _key_is_virtual: bool = BooleanField() """Indicates whether `key` is virtual or part of an actual file path.""" # be mindful that below, passing related_name="+" leads to errors @@ -2177,6 +2336,14 @@ def labels(self) -> LabelManager: related_name="created_artifacts", ) """Creator of record.""" + _curator: dict[str, str] | None = models.JSONField( + default=None, db_default=None, null=True + ) + _overwrite_versions: bool = BooleanField(default=None) + """Indicates whether to store or overwrite versions. + + It defaults to False for file-like artifacts and to True for folder-like artifacts. + """ @overload def __init__( @@ -2189,7 +2356,7 @@ def __init__( # here; and we might refactor this but we might also keep that internal # usage data: UPathStr, - type: ArtifactType | None = None, + type: ArtifactKind | None = None, key: str | None = None, description: str | None = None, revises: Artifact | None = None, @@ -2209,6 +2376,11 @@ def __init__( ): pass + @property + def transform(self) -> Transform | None: + """Transform whose run created the artifact.""" + return self.run.transform if self.run is not None else None + @property def path(self) -> Path: """Path. @@ -2480,7 +2652,7 @@ def delete( ) -> None: """Trash or permanently delete. 
- A first call to `.delete()` puts an artifact into the trash (sets `visibility` to `-1`). + A first call to `.delete()` puts an artifact into the trash (sets `_branch_code` to `-1`). A second call permanently deletes the artifact. FAQ: :doc:`docs:faq/storage` @@ -2529,10 +2701,6 @@ def describe(self) -> None: pass -# auto-generated through choices() -delattr(Artifact, "get_visibility_display") - - class Collection(Record, IsVersioned, TracksRun, TracksUpdates): """Collections of artifacts. @@ -2568,7 +2736,7 @@ class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta): _len_full_uid: int = 20 _len_stem_uid: int = 16 - _name_field: str = "name" + _name_field: str = "key" id: int = models.AutoField(primary_key=True) """Internal id, valid only in one DB instance.""" @@ -2576,10 +2744,10 @@ class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta): unique=True, db_index=True, max_length=_len_full_uid, default=base62_20 ) """Universal id, valid across DB instances.""" - name: str = CharField(max_length=150, db_index=True) - """Name or title of collection (required).""" + key: str = CharField(db_index=True) + """Name or path-like key.""" description: str | None = TextField(null=True) - """A description.""" + """A description or title.""" hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True) """Hash of collection content. 86 base64 chars allow to store 64 bytes, 512 bits.""" reference: str | None = CharField(max_length=255, db_index=True, null=True) @@ -2591,10 +2759,6 @@ class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta): "ULabel", through="CollectionULabel", related_name="collections" ) """ULabels sampled in the collection (see :class:`~lamindb.Feature`).""" - transform: Transform | None = ForeignKey( - Transform, PROTECT, related_name="output_collections", null=True, default=None - ) - """:class:`~lamindb.Transform` whose run created the collection.""" run: Run | None = ForeignKey( Run, PROTECT, related_name="output_collections", null=True, default=None ) @@ -2622,10 +2786,6 @@ class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta): collection from the artifact via a private field: `artifact._meta_of_collection`. """ - visibility: int = models.SmallIntegerField( - db_index=True, choices=VisibilityChoice.choices, default=1 - ) - """Visibility of collection record in queries & searches (1 default, 0 hidden, -1 trash).""" _actions: Artifact = models.ManyToManyField(Artifact, related_name="+") """Actions to attach for the UI.""" @@ -2795,6 +2955,19 @@ def restore(self) -> None: """ pass + @property + def transform(self) -> Transform | None: + """Transform whose run created the collection.""" + return self.run.transform if self.run is not None else None + + @property + def name(self) -> str: + """Name of the collection. + + Splits `key` on `/` and returns the last element. + """ + return self.key.split("/")[-1] + @property def ordered_artifacts(self) -> QuerySet: """Ordered `QuerySet` of `.artifacts`. 
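With `transform` and `name` on `Collection` now derived rather than stored, and trash handled through `_branch_code` exactly as for artifacts, a brief sketch under the assumption that a collection with this key exists (`.one()` raises otherwise):

```python
import lamindb as ln

collection = ln.Collection.filter(key="scrna/all-batches").one()  # illustrative key
assert collection.name == "all-batches"  # derived: key.split("/")[-1]
assert collection.transform is None or collection.transform == collection.run.transform

collection.delete()  # first call moves the record to trash: _branch_code == -1
trashed = ln.Collection.filter(key="scrna/all-batches", _branch_code=-1).first()
```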
@@ -2828,10 +3001,6 @@ def describe(self) -> None: pass -# auto-generated through choices() -delattr(Collection, "get_visibility_display") - - # ------------------------------------------------------------------------------------- # Link models @@ -2844,7 +3013,7 @@ class ValidateFields: pass -class FeatureSetFeature(Record, LinkORM): +class FeatureSetFeature(BasicRecord, LinkORM): id: int = models.BigAutoField(primary_key=True) # we follow the lower() case convention rather than snake case for link models featureset: FeatureSet = ForeignKey(FeatureSet, CASCADE, related_name="+") @@ -2854,7 +3023,7 @@ class Meta: unique_together = ("featureset", "feature") -class ArtifactFeatureSet(Record, LinkORM, TracksRun): +class ArtifactFeatureSet(BasicRecord, LinkORM, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_feature_set") # we follow the lower() case convention rather than snake case for link models @@ -2870,7 +3039,7 @@ class Meta: unique_together = ("artifact", "featureset") -class CollectionArtifact(Record, LinkORM, TracksRun): +class CollectionArtifact(BasicRecord, LinkORM, TracksRun): id: int = models.BigAutoField(primary_key=True) collection: Collection = ForeignKey( Collection, CASCADE, related_name="links_artifact" @@ -2881,7 +3050,7 @@ class Meta: unique_together = ("collection", "artifact") -class ArtifactULabel(Record, LinkORM, TracksRun): +class ArtifactULabel(BasicRecord, LinkORM, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="links_ulabel") ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_artifact") @@ -2897,7 +3066,16 @@ class Meta: unique_together = ("artifact", "ulabel", "feature") -class CollectionULabel(Record, LinkORM, TracksRun): +class TransformULabel(BasicRecord, LinkORM, TracksRun): + id: int = models.BigAutoField(primary_key=True) + transform: Transform = ForeignKey(Transform, CASCADE, related_name="links_ulabel") + ulabel: ULabel = ForeignKey(ULabel, PROTECT, related_name="links_transform") + + class Meta: + unique_together = ("transform", "ulabel") + + +class CollectionULabel(BasicRecord, LinkORM, TracksRun): id: int = models.BigAutoField(primary_key=True) collection: Collection = ForeignKey( Collection, CASCADE, related_name="links_ulabel" @@ -2913,7 +3091,7 @@ class Meta: unique_together = ("collection", "ulabel") -class ArtifactFeatureValue(Record, LinkORM, TracksRun): +class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+") # we follow the lower() case convention rather than snake case for link models @@ -2923,17 +3101,23 @@ class Meta: unique_together = ("artifact", "featurevalue") -class RunParamValue(Record, LinkORM): +class RunParamValue(BasicRecord, LinkORM): id: int = models.BigAutoField(primary_key=True) run: Run = ForeignKey(Run, CASCADE, related_name="+") # we follow the lower() case convention rather than snake case for link models paramvalue: ParamValue = ForeignKey(ParamValue, PROTECT, related_name="+") + created_at: datetime = DateTimeField(auto_now_add=True, db_index=True) + """Time of creation of record.""" + created_by: User = ForeignKey( + "lamindb.User", PROTECT, default=current_user_id, related_name="+" + ) + """Creator of record.""" class Meta: unique_together = ("run", "paramvalue") -class ArtifactParamValue(Record, LinkORM): +class 
ArtifactParamValue(BasicRecord, LinkORM, TracksRun): id: int = models.BigAutoField(primary_key=True) artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+") # we follow the lower() case convention rather than snake case for link models diff --git a/noxfile.py b/noxfile.py index 2113f9442..e065f65f8 100644 --- a/noxfile.py +++ b/noxfile.py @@ -150,8 +150,11 @@ def install_ci(session, group): # installing this after lamindb to be sure that these packages won't be reinstaled # during lamindb installation if IS_PR or group == "docs": - cmd = "uv pip install --system --no-deps ./sub/lamindb-setup ./sub/lamin-cli" - run(session, cmd) + run( + session, + "uv pip install --system --no-deps ./sub/lamindb-setup ./sub/lamin-cli ./sub/ourprojects", + ) + run(session, "uv pip uninstall --system lnschema-core") if "bionty" in extras: run( session, diff --git a/pyproject.toml b/pyproject.toml index d3c2f327a..747caea4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,12 +4,11 @@ build-backend = "flit_core.buildapi" [project] name = "lamindb" -requires-python = ">=3.9,<3.13" +requires-python = ">=3.10,<3.13" authors = [{name = "Lamin Labs", email = "open-source@lamin.ai"}] readme = "README.md" dynamic = ["version", "description"] classifiers = [ - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -17,9 +16,9 @@ classifiers = [ dependencies = [ # Lamin PINNED packages "lamin_utils==0.13.10", - "lamin_cli==0.22.0", - "lamindb_setup[aws]==0.81.4", - "ourprojects==0.1.0", + "lamin_cli==1.0a1", + "lamindb_setup[aws]==1.0a1", + "ourprojects==1.0a1", # others "pyarrow", "typing_extensions!=4.6.0", @@ -37,7 +36,7 @@ Home = "https://github.com/laminlabs/lamindb" [project.optional-dependencies] bionty = [ - "bionty==0.53.2", + "bionty==1.0a1", ] gcp = [ "lamindb_setup[gcp]", @@ -58,10 +57,7 @@ erdiagram = [ "django-schema-graph", ] wetlab = [ - "wetlab" -] -findrefs = [ - "findrefs" + "wetlab==1.0a1" ] clinicore = [ "clinicore" @@ -83,7 +79,7 @@ dev = [ "pytest-cov", "mudata", # others - "nbproject_test>=0.5.1", + "nbproject_test>=0.6.0", # biology "faker-biology", ] @@ -178,7 +174,13 @@ lint.ignore = [ # Prefer absolute imports over relative imports from parent modules "TID252", # Standard pseudo-random generators are not suitable for cryptographic purposes - "S311" + "S311", + # All of the below TODO 3.10 refactor, temporarily disable + "UP007", + "UP038", + "B905", + "UP035", + "RUF100", ] [tool.ruff.lint.pydocstyle] diff --git a/sub/bionty b/sub/bionty index 79fce9c0c..e1db124c1 160000 --- a/sub/bionty +++ b/sub/bionty @@ -1 +1 @@ -Subproject commit 79fce9c0ca8b26474cd6a19b7e786db4f8f6e971 +Subproject commit e1db124c1deab84b583463c4e6005ad849e7866b diff --git a/sub/clinicore b/sub/clinicore index 6c0955546..687e9d2e4 160000 --- a/sub/clinicore +++ b/sub/clinicore @@ -1 +1 @@ -Subproject commit 6c0955546bc0b0d04b89a9f515b2f519ed3dadeb +Subproject commit 687e9d2e48d5442e8660efa141be3ed671669d22 diff --git a/sub/lamin-cli b/sub/lamin-cli index d68f2101d..c005ed444 160000 --- a/sub/lamin-cli +++ b/sub/lamin-cli @@ -1 +1 @@ -Subproject commit d68f2101d0bb917bce14a3721b4c4c0588138b59 +Subproject commit c005ed444b2e8f236c2f7dfd0469921d85d6e670 diff --git a/sub/lamindb-setup b/sub/lamindb-setup index ebd93eb1b..60d357fd9 160000 --- a/sub/lamindb-setup +++ b/sub/lamindb-setup @@ -1 +1 @@ -Subproject commit ebd93eb1b91c6d46266985d08e10bd4fb8c0d8c0 +Subproject commit 
60d357fd920b4cc0d6ddc508165363236df13202 diff --git a/sub/ourprojects b/sub/ourprojects index b3074be2e..0b5b9e96b 160000 --- a/sub/ourprojects +++ b/sub/ourprojects @@ -1 +1 @@ -Subproject commit b3074be2e4c3a9299c54335c36e1f80a49dbd5a7 +Subproject commit 0b5b9e96b80075adeba764f835373d9967a4b807 diff --git a/sub/wetlab b/sub/wetlab index d69b7c9f4..f2220cf84 160000 --- a/sub/wetlab +++ b/sub/wetlab @@ -1 +1 @@ -Subproject commit d69b7c9f48bea02ad9ef0d964a567e4bc8bd6e70 +Subproject commit f2220cf84b479608a7e3e9fab650d638b120a0af diff --git a/tests/core/notebooks/no-title.ipynb b/tests/core/notebooks/no-title.ipynb index 7ad71b073..d73e583d6 100644 --- a/tests/core/notebooks/no-title.ipynb +++ b/tests/core/notebooks/no-title.ipynb @@ -35,7 +35,7 @@ "metadata": {}, "outputs": [], "source": [ - "assert ln.context.transform.name == \"no-title\"\n", + "assert ln.context.transform.description is None\n", "assert ln.context.transform.key == \"no-title.ipynb\"" ] } diff --git a/tests/core/test_artifact.py b/tests/core/test_artifact.py index 5bf56fa84..a696c8f7d 100644 --- a/tests/core/test_artifact.py +++ b/tests/core/test_artifact.py @@ -193,9 +193,7 @@ def test_revise_artifact(df, adata): assert error.exconly() == "ValueError: Please increment the previous version: '1'" # create new file from old file - artifact_r2 = ln.Artifact.from_anndata( - adata, is_new_version_of=artifact - ) # backward compat + artifact_r2 = ln.Artifact.from_anndata(adata, revises=artifact) assert artifact_r2.stem_uid == artifact.stem_uid assert artifact_r2.uid.endswith("0001") artifact_r2 = ln.Artifact.from_anndata(adata, revises=artifact) @@ -274,8 +272,8 @@ def test_create_from_dataframe(df): artifact = ln.Artifact.from_df(df, description="test1") assert artifact.description == "test1" assert artifact.key is None - assert artifact._accessor == "DataFrame" - assert artifact.type == "dataset" + assert artifact.otype == "DataFrame" + assert artifact.kind == "dataset" assert hasattr(artifact, "_local_filepath") artifact.save() # can do backed now, tested in test_storage.py @@ -297,7 +295,7 @@ def test_create_from_dataframe_using_from_df_and_link_features(df): # backward compatibility for ln.Artifact to take a DataFrame artifact = ln.Artifact(df, key="folder/hello.parquet", description=description) assert artifact.description == description - assert artifact._accessor == "DataFrame" + assert artifact.otype == "DataFrame" assert artifact.key == "folder/hello.parquet" assert artifact._key_is_virtual assert artifact.uid in artifact.path.as_posix() @@ -324,7 +322,7 @@ def test_create_from_anndata_in_memory_and_link_features(adata): ) ln.save(ln.Feature.from_df(adata.obs)) artifact = ln.Artifact.from_anndata(adata, description="test") - assert artifact._accessor == "AnnData" + assert artifact.otype == "AnnData" assert hasattr(artifact, "_local_filepath") artifact.save() # check that the local filepath has been cleared @@ -345,7 +343,7 @@ def test_create_from_anndata_in_memory_and_link_features(adata): def test_create_from_anndata_strpath(adata_file): artifact = ln.Artifact.from_anndata(adata_file, description="test adata file") artifact.save() - assert artifact._accessor == "AnnData" + assert artifact.otype == "AnnData" artifact.delete(permanent=True, storage=True) @@ -359,13 +357,12 @@ def test_create_from_anndata_in_storage(data): artifact = ln.Artifact.from_anndata( data, description="test_create_from_anndata" ) - assert artifact._accessor == "AnnData" + assert artifact.otype == "AnnData" assert hasattr(artifact, 
"_local_filepath") else: previous_storage = ln.setup.settings.storage.root_as_str ln.settings.storage = "s3://lamindb-test/core" filepath = data - # TODO: automatically add accessor based on file suffix artifact = ln.Artifact(filepath) artifact.save() # check that the local filepath has been cleared @@ -422,7 +419,7 @@ def test_create_from_local_filepath( else artifact.description == description ) assert artifact.suffix == suffix - assert artifact.n_objects is None + assert artifact.n_files is None artifact.save() assert artifact.path.exists() @@ -506,7 +503,7 @@ def test_from_dir_many_artifacts(get_test_filepaths, key): def test_delete_artifact(df): artifact = ln.Artifact.from_df(df, description="My test file to delete") artifact.save() - assert artifact.visibility == 1 + assert artifact._branch_code == 1 assert artifact.key is None or artifact._key_is_virtual storage_path = artifact.path # trash behavior @@ -514,13 +511,13 @@ def test_delete_artifact(df): assert storage_path.exists() assert ln.Artifact.filter(description="My test file to delete").first() is None assert ln.Artifact.filter( - description="My test file to delete", visibility=-1 + description="My test file to delete", _branch_code=-1 ).first() # permanent delete artifact.delete(permanent=True) assert ( ln.Artifact.filter( - description="My test file to delete", visibility=None + description="My test file to delete", _branch_code=None ).first() is None ) @@ -544,7 +541,7 @@ def test_delete_artifact(df): assert ( ln.Artifact.filter( description="My test file to delete from non-default storage", - visibility=None, + _branch_code=None, ).first() is None ) @@ -777,8 +774,8 @@ def callback(*args, **kwargs): write_adata_zarr(adata, zarr_path, callback) artifact = ln.Artifact(zarr_path, key="test_adata.zarr") - assert artifact._accessor == "AnnData" - assert artifact.n_objects >= 1 + assert artifact.otype == "AnnData" + assert artifact.n_files >= 1 artifact.save() assert isinstance(artifact.path, CloudPath) @@ -802,9 +799,9 @@ def callback(*args, **kwargs): # test zarr from memory artifact = ln.Artifact(adata, key="test_adata.anndata.zarr") assert artifact._local_filepath.is_dir() - assert artifact._accessor == "AnnData" + assert artifact.otype == "AnnData" assert artifact.suffix == ".anndata.zarr" - assert artifact.n_objects >= 1 + assert artifact.n_files >= 1 artifact.save() assert isinstance(artifact.path, CloudPath) @@ -862,61 +859,42 @@ def test_adata_suffix(adata): def test_bulk_delete(): report_path = Path("report.html") - with open(report_path, "w") as f: - f.write("a") - _source_code_artifact_path = Path("code.py") - with open(_source_code_artifact_path, "w") as f: - f.write("b") + report_path.write_text("a") environment_path = Path("environment.txt") - with open(environment_path, "w") as f: - f.write("c") - report = ln.Artifact(report_path, description="Report") - report.save() + environment_path.write_text("c") + report = ln.Artifact(report_path, description="Report").save() report_path.unlink() report_path = report.path - _source_code_artifact = ln.Artifact( - _source_code_artifact_path, description="Source" - ) - _source_code_artifact.save() - _source_code_artifact_path.unlink() - _source_code_artifact_path = _source_code_artifact.path - environment = ln.Artifact(environment_path, description="requirement.txt") - environment.save() + environment = ln.Artifact(environment_path, description="requirement.txt").save() environment_path.unlink() environment_path = environment.path - ln.Artifact.filter( - 
id__in=[_source_code_artifact.id, environment.id, report.id] - ).delete() + ln.Artifact.filter(id__in=[environment.id, report.id]).delete() + assert len(ln.Artifact.filter(id__in=[environment.id, report.id]).all()) == 0 + + # the 2 artifacts are in trash now assert ( len( ln.Artifact.filter( - id__in=[_source_code_artifact.id, environment.id, report.id] - ).all() - ) - == 0 - ) - assert ( - len( - ln.Artifact.filter( - id__in=[_source_code_artifact.id, environment.id, report.id], - visibility=-1, + id__in=[environment.id, report.id], + _branch_code=-1, ) .distinct() .all() ) - == 3 + == 2 ) - ln.Artifact.filter( - id__in=[_source_code_artifact.id, environment.id, report.id], visibility=-1 - ).delete(permanent=True) + ln.Artifact.filter(id__in=[environment.id, report.id], _branch_code=-1).delete( + permanent=True + ) + # now they're gone assert ( len( ln.Artifact.filter( - id__in=[_source_code_artifact.id, environment.id, report.id], - visibility=None, + id__in=[environment.id, report.id], + _branch_code=None, ) .distinct() .all() @@ -925,7 +903,6 @@ def test_bulk_delete(): ) assert not report_path.exists() - assert not _source_code_artifact_path.exists() assert not environment_path.exists() @@ -958,7 +935,7 @@ def test_gcp_paths(): "gs://rxrx1-europe-west4/images/test/HEPG2-08", description="Test GCP folder" ).save() assert artifact_folder.hash == "6r5Hkce0UTy7X6gLeaqzBA" - assert artifact_folder.n_objects == 14772 + assert artifact_folder.n_files == 14772 artifact_file = ln.Artifact( "gs://rxrx1-europe-west4/images/test/HEPG2-08/Plate1/B02_s1_w1.png", diff --git a/tests/core/test_artifact_folders.py b/tests/core/test_artifact_folders.py index 98db8ee0e..85a424c58 100644 --- a/tests/core/test_artifact_folders.py +++ b/tests/core/test_artifact_folders.py @@ -26,7 +26,7 @@ def test_folder_like_artifact(get_test_filepaths, key): ) return None artifact1 = ln.Artifact(test_dirpath, key=key) - assert artifact1.n_objects == 3 + assert artifact1.n_files == 3 assert artifact1.hash == hash_test_dir assert artifact1._state.adding assert artifact1.description is None @@ -48,7 +48,7 @@ def test_folder_like_artifact(get_test_filepaths, key): test_filepath_added = test_dirpath / "my_file_added.txt" test_filepath_added.write_text("2") artifact3 = ln.Artifact(test_dirpath, key=key, revises=artifact1) - assert artifact3.n_objects == 4 + assert artifact3.n_files == 4 assert artifact3.hash != hash_test_dir assert artifact3._state.adding assert artifact3.description is None @@ -70,5 +70,5 @@ def test_folder_like_artifact_s3(): study0_data = ln.Artifact("s3://lamindata/iris_studies/study0_raw_images") assert study0_data.hash == "IVKGMfNwi8zKvnpaD_gG7w" assert study0_data._hash_type == "md5-d" - assert study0_data.n_objects == 51 + assert study0_data.n_files == 51 assert study0_data.size == 658465 diff --git a/tests/core/test_collection.py b/tests/core/test_collection.py index 717e4c173..37f832dc0 100644 --- a/tests/core/test_collection.py +++ b/tests/core/test_collection.py @@ -91,12 +91,12 @@ def test_edge_cases(df): with pytest.raises(ValueError) as error: ln.Collection(df, invalid_param=1) assert str(error.exconly()).startswith( - "ValueError: Only artifacts, name, run, description, reference, reference_type, visibility can be passed, you passed: " + "ValueError: Only artifacts, key, run, description, reference, reference_type can be passed, you passed: " ) with pytest.raises(ValueError) as error: ln.Collection(1, name="Invalid") assert str(error.exconly()).startswith( - "ValueError: Artifact or 
List[Artifact] is allowed." + "ValueError: Artifact or list[Artifact] is allowed." ) artifact = ln.Artifact.from_df(df, description="Test artifact") assert artifact._state.adding @@ -116,12 +116,9 @@ def test_edge_cases(df): def test_from_inconsistent_artifacts(df, adata): - artifact1 = ln.Artifact.from_df(df, description="My test") - artifact1.save() - artifact2 = ln.Artifact.from_anndata(adata, description="My test2") - artifact2.save() - collection = ln.Collection([artifact1, artifact2], name="Inconsistent") - collection.save() + artifact1 = ln.Artifact.from_df(df, description="My test").save() + artifact2 = ln.Artifact.from_anndata(adata, description="My test2").save() + collection = ln.Collection([artifact1, artifact2], name="Inconsistent").save() # test idempotency of .save() collection.save() # create a run context @@ -174,7 +171,7 @@ def test_from_consistent_artifacts(adata, adata2): collection2 = ln.Collection([artifact1, artifact2], name="My test 1", run=run) assert not collection2._state.adding assert collection2.id == collection.id - assert collection2.name == "My test 1" + assert collection2.key == "My test 1" collection.delete(permanent=True) artifact1.delete(permanent=True) @@ -379,15 +376,13 @@ def test_collection_mapped(adata, adata2): def test_revise_collection(df, adata): # create a versioned collection - artifact = ln.Artifact.from_df(df, description="test") - artifact.save() + artifact = ln.Artifact.from_df(df, description="test").save() collection = ln.Collection(artifact, name="test", version="1") assert collection.version == "1" assert collection.uid.endswith("0000") collection.save() - artifact = ln.Artifact.from_anndata(adata, description="test") - artifact.save() + artifact = ln.Artifact.from_anndata(adata, description="test").save() with pytest.raises(ValueError) as error: collection_r2 = ln.Collection(artifact, revises=collection, version="1") @@ -397,14 +392,14 @@ def test_revise_collection(df, adata): ln.Collection(adata, revises="wrong-type") # create new collection from old collection - collection_r2 = ln.Collection(artifact, is_new_version_of=collection) + collection_r2 = ln.Collection(artifact, revises=collection) assert collection_r2.stem_uid == collection.stem_uid assert collection_r2.uid.endswith("0001") collection_r2 = ln.Collection(artifact, revises=collection) assert collection_r2.stem_uid == collection.stem_uid assert collection_r2.uid.endswith("0001") assert collection_r2.version is None - assert collection_r2.name == "test" + assert collection_r2.key == "test" collection_r2.save() @@ -418,7 +413,7 @@ def test_revise_collection(df, adata): assert collection_r3.stem_uid == collection.stem_uid assert collection_r3.version == "2" assert collection_r3.uid.endswith("0002") - assert collection_r3.name == "test1" + assert collection_r3.key == "test1" artifacts_r2 = collection_r2.artifacts.all() collection_r2.delete(permanent=True) @@ -429,13 +424,9 @@ def test_revise_collection(df, adata): def test_collection_append(df, adata): - artifact = ln.Artifact.from_df(df, description="test") - artifact.save() - artifact_1 = ln.Artifact.from_anndata(adata, description="test") - artifact_1.save() - - col = ln.Collection(artifact, name="Test", description="Test append") - col.save() + artifact = ln.Artifact.from_df(df, description="test").save() + artifact_1 = ln.Artifact.from_anndata(adata, description="test").save() + col = ln.Collection(artifact, name="Test", description="Test append").save() col_append = col.append(artifact_1).save() diff --git 
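
The collection tests above exercise two API changes at once: `.save()` now returns the record itself, so construction and saving chain into one expression, and `revises=` replaces the former `is_new_version_of=` keyword. A minimal sketch of both, using placeholder dataframes:

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    artifact_v1 = ln.Artifact.from_df(df, description="test").save()  # chained save
    collection = ln.Collection(artifact_v1, name="test", version="1").save()

    # new version via `revises=` (formerly `is_new_version_of=`)
    df2 = pd.DataFrame({"a": [1, 2, 3]})
    artifact_v2 = ln.Artifact.from_df(df2, revises=artifact_v1).save()
    collection_v2 = ln.Collection(artifact_v2, revises=collection).save()
    assert collection_v2.stem_uid == collection.stem_uid
    assert collection_v2.uid.endswith("0001")
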
diff --git a/tests/core/test_context.py b/tests/core/test_context.py
index 7c3952c6c..eb2a118f1 100644
--- a/tests/core/test_context.py
+++ b/tests/core/test_context.py
@@ -101,26 +101,26 @@ def test_create_or_load_transform():
     context.version = version
     context._path = Path("my-test-transform-create-or-load.py")
     context._create_or_load_transform(
-        name=title,
+        description=title,
         transform_type="notebook",
     )
     assert context._transform.uid == uid
     assert context._transform.version == version
-    assert context._transform.name == title
+    assert context._transform.description == title
     context._create_or_load_transform(
-        name=title,
+        description=title,
     )
     assert context._transform.uid == uid
     assert context._transform.version == version
-    assert context._transform.name == title
+    assert context._transform.description == title
     # now, test an updated transform name
     context._create_or_load_transform(
-        name="updated title",
+        description="updated title",
     )
     assert context._transform.uid == uid
     assert context._transform.version == version
-    assert context._transform.name == "updated title"
+    assert context._transform.description == "updated title"


 def test_run_scripts():
@@ -250,12 +250,11 @@ def test_run_external_script():
         "https://github.com/laminlabs/lamin-cli/blob/"
     )
     assert transform.reference_type == "url"
-    assert transform.name == "My good script"
+    assert transform.description == "My good script"
     # ensure that the source code is not saved as an output artifact
     assert transform.latest_run.output_artifacts.count() == 0
     assert transform.runs.count() == 1
     assert transform.hash == "MoIciBQ0lpVPCKQGofPX6g"
-    assert transform._source_code_artifact is None


 @pytest.mark.parametrize("type", ["notebook", "script"])
diff --git a/tests/core/test_data.py b/tests/core/test_data.py
index 63ac62e0f..230d13e6d 100644
--- a/tests/core/test_data.py
+++ b/tests/core/test_data.py
@@ -1,15 +1,5 @@
 import lamindb as ln
 import pytest
-from lamindb.core._data import add_transform_to_kwargs
-
-
-def test_add_transform_to_kwargs():
-    kwargs = {}
-    transform = ln.Transform(name="hello")
-    transform.save()
-    run = ln.Run(transform)
-    add_transform_to_kwargs(kwargs, run)
-    assert kwargs["transform"] == transform


 def test_rename():
diff --git a/tests/core/test_describe_df.py b/tests/core/test_describe_and_df_calls.py
similarity index 99%
rename from tests/core/test_describe_df.py
rename to tests/core/test_describe_and_df_calls.py
index e4c46cc88..2c78bd530 100644
--- a/tests/core/test_describe_df.py
+++ b/tests/core/test_describe_and_df_calls.py
@@ -100,6 +100,7 @@ def test_curate_df():
         .df(include=["feature_sets__hash", "feature_sets__name"])
         .drop(["uid"], axis=1)
     )
+    print(df)
     expected_data = {
         "key": ["example_datasets/dataset2.h5ad", "example_datasets/dataset1.h5ad"],
         "description": [None, None],
diff --git a/tests/core/test_models.py b/tests/core/test_models.py
index 9ed5f8a7e..4ce03b100 100644
--- a/tests/core/test_models.py
+++ b/tests/core/test_models.py
@@ -17,11 +17,14 @@ def test_registry__repr__param():
       Simple fields
         .name: CharField
         .dtype: CharField
+        .type: CharField
         .created_at: DateTimeField
         .updated_at: DateTimeField
+        .aux: JSONField
       Relational fields
         .created_by: User
         .run: Run
+        .space: Space
         .values: ParamValue
     """).strip()
@@ -41,19 +44,20 @@ def test_registry__repr__artifact():
         .key: CharField
         .description: CharField
         .suffix: CharField
-        .type: CharField
+        .kind: CharField
+        .otype: CharField
         .size: BigIntegerField
         .hash: CharField
-        .n_objects: BigIntegerField
+        .n_files: BigIntegerField
         .n_observations: BigIntegerField
-        .visibility: SmallIntegerField
         .version: CharField
         .is_latest: BooleanField
         .created_at: DateTimeField
         .updated_at: DateTimeField
+        .aux: JSONField
       Relational fields
+        .space: Space
         .storage: Storage
-        .transform: Transform
         .run: Run
         .created_by: User
         .ulabels: ULabel
diff --git a/tests/core/test_record.py b/tests/core/test_record.py
index ba2add9ff..be5fe162e 100644
--- a/tests/core/test_record.py
+++ b/tests/core/test_record.py
@@ -165,16 +165,15 @@ def test_suggest_similar_names():

 def test_pass_version():
-    # creating a new transform on key bumps the version uid
-    # hence we'll get an error if we don't also increment the semantic version
-    ln.Transform(key="mytransform", version="1").save()
+    # creating a new transform on key retrieves the same transform
+    # for as long as no source_code was saved
+    transform = ln.Transform(key="mytransform", version="1").save()
+    assert ln.Transform(key="mytransform", version="1") == transform
+    # in case source code is saved
+    transform.source_code = "dummy"
+    transform.save()
     with pytest.raises(ValueError, match="Please increment the previous version"):
         ln.Transform(key="mytransform", version="1")
-    # creating a new transform on name retrieves the same transform
-    # upon re-naming to description, this will be unintuitive, but I fear
-    # we need it nonetheless to maintain backward-compat
-    transform = ln.Transform(name="mytransform", version="1").save()
-    assert ln.Transform(name="mytransform", version="1") == transform


 def test_get_name_field():
@@ -187,19 +186,24 @@ def test_get_name_field():

 def test_using():
+    # the two below calls error if the records aren't found
     ln.Artifact.using("laminlabs/lamin-site-assets").get(1)
     ln.Artifact.using("laminlabs/lamin-site-assets").get(uid="MqEaGU7fXvxNy61R0000")
     # cross-database query
-    cell_types = bt.CellType.using("laminlabs/lamindata").lookup()
-    assert (
-        ln.Artifact.using("laminlabs/lamindata")
-        .filter(cell_types=cell_types.t_cell)
+    hemangioblast = bt.CellType.from_source(name="hemangioblast").save()
+    artifact = (
+        ln.Artifact.using("laminlabs/lamin-dev")
+        .filter(cell_types=hemangioblast)
         .first()
-        is not None
     )
-    assert (
-        ln.Artifact.using("laminlabs/lamindata")
-        .filter(cell_types__in=[cell_types.t_cell])
+    assert artifact is not None
+    hemangioblast_dev = artifact.cell_types.get(name="hemangioblast")
+    assert hemangioblast_dev.uid == hemangioblast.uid
+    assert hemangioblast_dev.id != hemangioblast.id
+    # query via list
+    artifact_ref = (
+        ln.Artifact.using("laminlabs/lamin-dev")
+        .filter(cell_types__in=[hemangioblast])
         .first()
-        is not None
     )
+    assert artifact == artifact_ref
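
test_pass_version above encodes the new idempotency rule: constructing a `Transform` on an existing `key` returns the existing record for as long as no `source_code` has been saved; only after source code is saved does reusing the version raise. A minimal sketch, with a dummy source-code string:

    import lamindb as ln

    transform = ln.Transform(key="mytransform", version="1").save()
    # no source code saved yet -> the same record is returned
    assert ln.Transform(key="mytransform", version="1") == transform

    transform.source_code = "print('hello')"  # saving source code freezes the version
    transform.save()
    # reusing key + version now raises ValueError("Please increment the previous version")
    transform_v2 = ln.Transform(key="mytransform", version="2")  # works
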
diff --git a/tests/core/test_transform.py b/tests/core/test_transform.py
index 827051adf..95310382b 100644
--- a/tests/core/test_transform.py
+++ b/tests/core/test_transform.py
@@ -25,13 +25,20 @@ def test_revise_transforms():
     # try to reload the same transform with the same uid
     transform_reload = ln.Transform(uid=transform.uid, name="My transform updated name")
     assert transform_reload.id == transform.id
-    assert transform_reload.name == "My transform updated name"
+    assert transform_reload.key == "My transform"  # unchanged, prints logging
+    transform_reload = ln.Transform(
+        uid=transform.uid, description="My transform updated name"
+    )
+    assert transform_reload.id == transform.id
+    assert (
+        transform_reload.description == "My transform updated name"
+    )  # unchanged, prints logging

     # create new transform from old transform
-    transform_r2 = ln.Transform(name="My 2nd transform", is_new_version_of=transform)
+    transform_r2 = ln.Transform(description="My 2nd transform", revises=transform)
     assert transform_r2.uid != transform.uid
     assert transform_r2.uid.endswith("0001")
-    transform_r2 = ln.Transform(name="My 2nd transform", revises=transform)
+    transform_r2 = ln.Transform(description="My 2nd transform", revises=transform)
     assert transform_r2.uid != transform.uid
     assert transform_r2.uid.endswith("0001")
     assert transform_r2.stem_uid == transform.stem_uid
@@ -42,20 +49,29 @@ def test_revise_transforms():
     assert not transform.is_latest

     # create new transform from newly versioned transform
-    transform_r3 = ln.Transform(name="My transform", revises=transform_r2, version="2")
+    transform_r3 = ln.Transform(
+        description="My transform", revises=transform_r2, version="2"
+    )
     assert transform_r3.stem_uid == transform.stem_uid
     assert transform_r3.version == "2"

-    # default name
+    # default description
     transform_r3 = ln.Transform(revises=transform_r2)
-    assert transform_r3.name == transform_r2.name
+    assert transform_r3.description == transform_r2.description

     # revise by matching on `key`
     key = "my-notebook.ipynb"
     transform_r2.key = key
     transform_r2.save()
     assert transform_r2.is_latest
-    transform_r3 = ln.Transform(name="My transform", key=key, version="2")
+    transform_r3 = ln.Transform(description="My transform", key=key, version="2")
+    assert transform_r3.uid[:-4] == transform_r2.uid[:-4]
+    assert transform_r3.uid.endswith("0001")
+    # this only fires if source code was actually saved
+    transform_r2.source_code = "something"
+    transform_r2.save()
+    transform_r3 = ln.Transform(description="My transform", key=key, version="2")
+    assert transform_r3.uid[:-4] == transform_r2.uid[:-4]
     assert transform_r3.uid.endswith("0002")
     assert transform_r3.stem_uid == transform_r2.stem_uid
     assert transform_r3.key == key
@@ -81,7 +97,7 @@ def test_revise_transforms():
     with pytest.raises(ValueError) as error:
         ln.Transform(x=1)
     assert (
-        error.exconly() == "ValueError: Only name, key, version, type, revises,"
+        error.exconly() == "ValueError: Only key, description, version, type, revises,"
         " reference, reference_type can be passed, but you passed: {'x': 1}"
     )
@@ -98,7 +114,7 @@ def test_revise_transforms():
     transform.save()

     # create new transform from old transform
-    new_transform = ln.Transform(name="My new transform", revises=transform)
+    new_transform = ln.Transform(description="My new transform", revises=transform)
     assert transform.version is None
     assert new_transform.stem_uid == transform.stem_uid
     assert new_transform.uid.endswith("0001")
@@ -109,51 +125,29 @@ def test_revise_transforms():

 def test_delete():
     # prepare the creation of a transform with its artifacts
-    transform = ln.Transform(name="My transform")
-    transform.save()
+    transform = ln.Transform(name="My transform").save()
     run = ln.Run(transform)
     report_path = Path("report.html")
     with open(report_path, "w") as f:
         f.write("a")
-    _source_code_artifact_path = Path("code.py")
-    with open(_source_code_artifact_path, "w") as f:
-        f.write("b")
     environment_path = Path("environment.txt")
     with open(environment_path, "w") as f:
         f.write("c")
-    report = ln.Artifact(report_path, description=f"Report of {run.uid}")
-    report.save()
+    report = ln.Artifact(report_path, description=f"Report of {run.uid}").save()
     report_path.unlink()
     report_path = report.path
-    _source_code_artifact = ln.Artifact(
-        _source_code_artifact_path, description=f"Source of {transform.uid}"
-    )
-    _source_code_artifact.save()
-    _source_code_artifact_path.unlink()
-    _source_code_artifact_path = _source_code_artifact.path
-    environment = ln.Artifact(environment_path, description="requirement.txt")
-    environment.save()
+    environment = ln.Artifact(environment_path, description="requirements.txt").save()
     environment_path.unlink()
     environment_path = environment.path
-    transform._source_code_artifact = _source_code_artifact
     transform.save()
     run.report = report
     run.environment = environment
     run.save()
     assert report_path.exists()
-    assert _source_code_artifact_path.exists()
     assert environment_path.exists()

     # now delete everything
     transform.delete()
     assert not report_path.exists()
-    assert not _source_code_artifact_path.exists()
     assert not environment_path.exists()
-    assert (
-        len(
-            ln.Artifact.filter(
-                id__in=[report.id, _source_code_artifact.id, environment.id]
-            ).all()
-        )
-        == 0
-    )
+    assert len(ln.Artifact.filter(id__in=[report.id, environment.id]).all()) == 0
     assert len(ln.Run.filter(id=run.id).all()) == 0
diff --git a/tests/core/test_types.py b/tests/core/test_types.py
deleted file mode 100644
index a692dd554..000000000
--- a/tests/core/test_types.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from lamindb.base.types import VisibilityChoice
-
-
-def test_visibility_choice():
-    assert VisibilityChoice.default == 1
-    assert VisibilityChoice.hidden == 0
-    assert VisibilityChoice.trash == -1
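
With test_types.py deleted, the former `VisibilityChoice` enum is gone; its three levels live on as plain integers on the private `_branch_code` field. The transform-deletion test above also shows the cascade: deleting a transform removes its runs' report and environment artifacts. A minimal sketch of that cascade, with illustrative file names:

    import lamindb as ln
    from pathlib import Path

    transform = ln.Transform(name="My transform").save()
    run = ln.Run(transform)
    Path("report.html").write_text("a")
    run.report = ln.Artifact("report.html", description=f"Report of {run.uid}").save()
    run.save()

    transform.delete()  # cascades to the run and its report artifact
    assert ln.Artifact.filter(id=run.report.id).first() is None
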
diff --git a/tests/core/test_versioning.py b/tests/core/test_versioning.py
index 554a268de..01a2435d0 100644
--- a/tests/core/test_versioning.py
+++ b/tests/core/test_versioning.py
@@ -88,37 +88,36 @@ def test_get_new_path_from_uid():

 def test_latest_version_and_get():
     # build one version family
-    transform_v1 = ln.Transform(name="Introduction")
-    transform_v1.save()
+    transform_v1 = ln.Transform(description="Introduction").save()
     assert transform_v1.is_latest
     assert transform_v1.version is None
     # pass the latest version, also vary the name for the fun of it
     transform_v2 = ln.Transform(
-        name="Introduction v2", revises=transform_v1, version="2"
-    )
-    transform_v2.save()
+        description="Introduction v2", revises=transform_v1, version="2"
+    ).save()
     assert not transform_v1.is_latest
     assert transform_v2.is_latest
     assert transform_v2.uid.endswith("0001")
     # consciously *not* pass the latest version to revises but the previous
     # it automatically retrieves the latest version
-    transform_v3 = ln.Transform(name="Introduction", revises=transform_v1)
-    transform_v3.save()
+    transform_v3 = ln.Transform(description="Introduction", revises=transform_v1).save()
     assert transform_v3.uid.endswith("0002")
-    assert not ln.Transform.objects.get(name="Introduction v2", version="2").is_latest
+    assert not ln.Transform.objects.get(
+        description="Introduction v2", version="2"
+    ).is_latest
     assert transform_v3.is_latest
-    transform_v4 = ln.Transform(name="Introduction")
-    transform_v4.save()
+    transform_v4 = ln.Transform(description="Introduction").save()
     assert transform_v4.is_latest
     # add another transform with the same name that's not part of this family
     # but will also be a hit for the query
-    assert len(ln.Transform.filter(name="Introduction").all()) == 3
-    assert len(ln.Transform.filter(name="Introduction").latest_version()) == 2
+    assert len(ln.Transform.filter(description="Introduction").all()) == 3
+    assert len(ln.Transform.filter(description="Introduction").latest_version()) == 2
     transform_v4.delete()
     with pytest.raises(Exception):  # noqa: B017 should be MultipleResultsFound
-        ln.Transform.get(name="Introduction")
+        ln.Transform.get(description="Introduction")
     assert (
-        ln.Transform.filter(name="Introduction").latest_version().one() == transform_v3
+        ln.Transform.filter(description="Introduction").latest_version().one()
+        == transform_v3
     )

     # test get
@@ -132,6 +131,8 @@ def test_latest_version_and_get():
     # test empty QuerySet
     assert (
-        ln.Transform.filter(name="IntroductionNotExists").latest_version().one_or_none()
+        ln.Transform.filter(description="IntroductionNotExists")
+        .latest_version()
+        .one_or_none()
         is None
     )
diff --git a/tests/core/test_visibility.py b/tests/core/test_visibility.py
index 812da691e..732e91e02 100644
--- a/tests/core/test_visibility.py
+++ b/tests/core/test_visibility.py
@@ -1,35 +1,37 @@
 import lamindb as ln


-def test_file_visibility():
-    # create a file with default visibility
-    with open("./test-visibility.txt", "w") as f:
-        f.write("visibility")
-    artifact = ln.Artifact("./test-visibility.txt", description="test-visibility")
-    assert artifact.visibility == 1
+def test_file__branch_code():
+    # create a file with default _branch_code
+    with open("./test-_branch_code.txt", "w") as f:
+        f.write("_branch_code")
+    artifact = ln.Artifact("./test-_branch_code.txt", description="test-_branch_code")
+    assert artifact._branch_code == 1
     artifact.save()

     # create a collection from file
-    collection = ln.Collection(artifact, name="test-visibility")
+    collection = ln.Collection(artifact, name="test-_branch_code")
     collection.save()

     # delete a collection will put both collection but not linked artifact in trash
     collection.delete()
-    assert collection.ordered_artifacts[0].visibility == 1
-    result = ln.Collection.filter(name="test-visibility").all()
+    assert collection.ordered_artifacts[0]._branch_code == 1
+    result = ln.Collection.filter(name="test-_branch_code").all()
     assert len(result) == 0
-    result = ln.Collection.filter(name="test-visibility", visibility=1).all()
+    result = ln.Collection.filter(name="test-_branch_code", _branch_code=1).all()
     assert len(result) == 0
-    result = ln.Collection.filter(name="test-visibility", visibility=None).all()
+    result = ln.Collection.filter(name="test-_branch_code", _branch_code=None).all()
     assert len(result) == 1

     # restore
     collection.restore()
-    assert collection.visibility == 1
-    assert collection.ordered_artifacts[0].visibility == 1
+    assert collection._branch_code == 1
+    assert collection.ordered_artifacts[0]._branch_code == 1

     # permanent delete
     collection.delete(permanent=True)
-    result = ln.Artifact.filter(description="test-visibility", visibility=None).all()
+    result = ln.Artifact.filter(
+        description="test-_branch_code", _branch_code=None
+    ).all()
     # also permanently deleted linked file
     assert len(result) == 1
diff --git a/tests/storage/test_storage.py b/tests/storage/test_storage.py
index 6fd6a8526..4d2b304b1 100644
--- a/tests/storage/test_storage.py
+++ b/tests/storage/test_storage.py
@@ -259,7 +259,7 @@ def test_write_read_tiledbsoma(storage):
     assert artifact_soma.path.stem == artifact_soma.uid[:16]
     assert artifact_soma.key == "scrna/my-big-dataset"
     assert artifact_soma._key_is_virtual
-    assert artifact_soma._accessor == "tiledbsoma"
+    assert artifact_soma.otype == "tiledbsoma"
     assert artifact_soma.n_observations == adata.n_obs

     with artifact_soma.open() as store:  # mode="r" by default
diff --git a/tests/storage/test_transfer.py b/tests/storage/test_transfer.py
index b652d2785..f4bb373d2 100644
--- a/tests/storage/test_transfer.py
+++ b/tests/storage/test_transfer.py
@@ -1,3 +1,4 @@
+import bionty as bt
 import lamindb as ln
 import pytest
 from lamindb.core._django import get_artifact_with_related
@@ -5,7 +6,6 @@

 def test_transfer_from_remote_to_local():
     """Test transfer from remote to local instance."""
-    import bionty as bt

     bt.Gene.filter().delete()
     bt.Organism.filter().delete()
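
Tying the `_branch_code` changes together: records carry default visibility 1, `.delete()` moves them to trash (-1) where default queries no longer see them, and passing `_branch_code=None` to `.filter()` lifts the filter entirely. A minimal sketch, assuming `restore()` is available on artifacts as it is on collections:

    import lamindb as ln
    from pathlib import Path

    Path("example.txt").write_text("hello")
    artifact = ln.Artifact("example.txt", description="example").save()
    assert artifact._branch_code == 1  # formerly `.visibility`

    artifact.delete()  # trash, not permanent
    assert artifact._branch_code == -1
    assert ln.Artifact.filter(description="example").first() is None
    assert ln.Artifact.filter(description="example", _branch_code=None).first() is not None

    artifact.restore()
    assert artifact._branch_code == 1
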