Merge branch 'main' into rename_name

laminlabs · Jul 15, 2024 · fd06cff
2 parents a8c5a20 + 76c11f4
commit fd06cff
Show file tree

Hide file tree

Showing 15 changed files with 36 additions and 69 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -22,7 +22,7 @@ jobs:
           - "faq"
           - "storage"
           - "cli"
-    timeout-minutes: 6
+    timeout-minutes: 5
 
     steps:
       - uses: actions/checkout@v4
@@ -55,7 +55,7 @@ jobs:
       - name: cache postgres use
         if: ${{ steps.cache-postgres.outputs.cache-hit == 'true' }}
         run: docker image load --input ~/postgres.tar
-      - run: pip install -U laminci
+      - run: pip install "laminci@git+https://x-access-token:${{ secrets.LAMIN_BUILD_DOCS }}@github.com/laminlabs/laminci"
       - name: install postgres
         if: ${{ matrix.group == 'faq' }}
         run: sudo apt-get install libpq-dev

diff --git a/docs/annotate.ipynb b/docs/annotate.ipynb
@@ -64,9 +64,7 @@
     "import lamindb as ln\n",
     "import bionty as bt\n",
     "import pandas as pd\n",
-    "import anndata as ad\n",
-    "\n",
-    "ln.settings.verbosity = \"hint\""
+    "import anndata as ad"
    ]
   },
   {
@@ -150,8 +148,7 @@
     "# create an object to guide validation and annotation\n",
     "annotate = ln.Annotate.from_df(df, categoricals=categoricals)\n",
     "# validate\n",
-    "validated = annotate.validate()\n",
-    "validated"
+    "annotate.validate()"
    ]
   },
   {
@@ -161,9 +158,9 @@
    "source": [
     "## Validate using registries in another instance\n",
     "\n",
-    "Sometimes you want to validate against other existing registries, for instance [cellxgene](https://lamin.ai/laminlabs/cellxgene).\n",
+    "Sometimes you want to validate against other existing registries, for instance [cellxgene](https://lamin.ai/laminlabs/lamindata).\n",
     "\n",
-    "This allows us to directly transfer values that are currently missing in our registries from the [cellxgene instance](https://lamin.ai/laminlabs/cellxgene)."
+    "This allows us to directly transfer values that are currently missing in our registries from the [cellxgene instance](https://lamin.ai/laminlabs/lamindata)."
    ]
   },
   {
@@ -198,7 +195,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "annotate.add_validated_from(df.cell_type.name)"
+    "# this adds cell types that were validated via the public ontology\n",
+    "annotate.add_validated_from(\"cell_type\")"
    ]
   },
   {

diff --git a/docs/faq/idempotency.ipynb b/docs/faq/idempotency.ipynb
@@ -527,7 +527,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "artifact4.filter(hash=\"KCEXRahJ-Ui9Y6nksQ8z1A\").df()"
+    "ln.Artifact.filter(hash=\"KCEXRahJ-Ui9Y6nksQ8z1A\").df()"
    ]
   },
   {
@@ -541,7 +541,7 @@
    },
    "outputs": [],
    "source": [
-    "assert len(artifact.filter(hash=\"KCEXRahJ-Ui9Y6nksQ8z1A\").list()) == 2"
+    "assert len(ln.Artifact.filter(hash=\"KCEXRahJ-Ui9Y6nksQ8z1A\").all()) == 2"
    ]
   },
   {
@@ -555,8 +555,7 @@
    },
    "outputs": [],
    "source": [
-    "!lamin delete --force test-idempotency\n",
-    "!rm -r test-idempotency"
+    "!lamin delete --force test-idempotency"
    ]
   }
  ],

diff --git a/docs/introduction.ipynb b/docs/introduction.ipynb
@@ -74,7 +74,9 @@
    "outputs": [],
    "source": [
     "# store artifacts in a local directory `./lamin-intro`\n",
-    "!lamin init --storage ./lamin-intro --schema bionty"
+    "!lamin init --storage ./lamin-intro --schema bionty\n",
+    "# disable django's unnecessary functionality for a clean API\n",
+    "!lamin set private-django-api true"
    ]
   },
   {
@@ -1087,13 +1089,13 @@
     "\n",
     "<img src=\"https://lamin-site-assets.s3.amazonaws.com/.lamindb/BunYmHkyFLITlM5MYQci.svg\" width=\"350px\" style=\"background: transparent\" align=\"right\">\n",
     "\n",
-    "The complexity of modern R&D data often blocks realizing the scientific progress it promises: see this [blog post](https://lamin.ai/blog/problems).\n",
-    "\n",
-    "More basically: The pydata family of objects is at the heart of most data science, ML & comp bio workflows: `DataFrame`, `AnnData`, `pytorch.DataLoader`, `zarr.Array`, `pyarrow.Table`, `xarray.Collection`, etc. We couldn’t find a tool to link these objects to context so that they could be analyzed in context:\n",
+    "Objects like `pd.DataFrame` are at the heart of many data science workflows but there hasn't been a tool to manage these objects in the rich context that collaborative biological research requires:\n",
     "\n",
     "- provenance: data sources, data transformations, models, users\n",
     "- domain knowledge & experimental metadata: the features & labels derived from domain entities\n",
     "\n",
+    "In this [blog post](https://lamin.ai/blog/problems), we discuss how the complexity of modern R&D data often blocks realizing the scientific progress it promises.\n",
+    "\n",
     "### Assumptions\n",
     "\n",
     "1. Batched datasets from physical instruments are transformed ({class}`~lamindb.Transform`) into useful representations ({class}`~lamindb.Artifact`)\n",

diff --git a/docs/transfer.ipynb b/docs/transfer.ipynb
@@ -14,16 +14,6 @@
     "Here, we'll show how to transfer data from another instance into the current instance."
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Install the `lamindb` Python package:\n",
-    "```shell\n",
-    "pip install 'lamindb[jupyter,aws,bionty]'\n",
-    "```"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -34,6 +24,7 @@
    },
    "outputs": [],
    "source": [
+    "# !pip install 'lamindb[jupyter,aws,bionty]'\n",
     "!lamin init --storage ./test-transfer --schema bionty"
    ]
   },
@@ -58,7 +49,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "All artifacts in the `laminlabs/cellxgene` clone of CZ CELLxGENE (for more info, see [cellxgene](inv:docs#cellxgene)):"
+    "All artifacts in the `laminlabs/lamindata` clone of CZ CELLxGENE (for more info, see [cellxgene](inv:docs#cellxgene)):"
    ]
   },
   {
@@ -71,7 +62,7 @@
    },
    "outputs": [],
    "source": [
-    "artifacts = ln.Artifact.using(\"laminlabs/cellxgene\")\n",
+    "artifacts = ln.Artifact.using(\"laminlabs/lamindata\")\n",
     "artifacts.df().head()"
    ]
   },

diff --git a/lamindb/_artifact.py b/lamindb/_artifact.py
@@ -1,22 +1,19 @@
 from __future__ import annotations
 
-import os
 import shutil
-from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path, PurePath, PurePosixPath
 from typing import TYPE_CHECKING, Any, Mapping
 
 import fsspec
 import lamindb_setup as ln_setup
 import pandas as pd
-import psutil
 from anndata import AnnData
 from lamin_utils import colors, logger
 from lamindb_setup import settings as setup_settings
 from lamindb_setup._init_instance import register_storage_in_instance
 from lamindb_setup.core._docs import doc_args
 from lamindb_setup.core._settings_storage import init_storage
-from lamindb_setup.core.hashing import b16_to_b64, hash_file, hash_md5s_from_dir
+from lamindb_setup.core.hashing import hash_dir, hash_file
 from lamindb_setup.core.upath import (
     create_path,
     extract_suffix_from_path,
@@ -207,26 +204,7 @@ def get_stat_or_artifact(
             return size, hash, hash_type, n_objects
     else:
         if path.is_dir():
-            files = (subpath for subpath in path.rglob("*") if subpath.is_file())
-
-            def hash_size(file):
-                file_size = file.stat().st_size
-                return hash_file(file, file_size)[0], file_size
-
-            try:
-                n_workers = len(psutil.Process().cpu_affinity())
-            except AttributeError:
-                n_workers = psutil.cpu_count()
-            if n_workers > 1:
-                with ThreadPoolExecutor(n_workers) as pool:
-                    hashes_sizes = pool.map(hash_size, files)
-            else:
-                hashes_sizes = map(hash_size, files)
-            hashes, sizes = zip(*hashes_sizes)
-
-            hash, hash_type = hash_md5s_from_dir(hashes)
-            n_objects = len(hashes)
-            size = sum(sizes)
+            size, hash, hash_type, n_objects = hash_dir(path)
         else:
             hash, hash_type = hash_file(path)
             size = stat.st_size
@@ -1106,5 +1084,3 @@ def restore(self) -> None:
 Artifact._delete_skip_storage = _delete_skip_storage
 Artifact._save_skip_storage = _save_skip_storage
 Artifact.path = path
-# this seems a Django-generated function
-delattr(Artifact, "get_visibility_display")
diff --git a/lamindb/_collection.py b/lamindb/_collection.py
@@ -390,7 +390,5 @@ def artifacts(self) -> QuerySet:
 for name in METHOD_NAMES:
     attach_func_to_class_method(name, Collection, globals())
 
-# this seems a Django-generated function
-delattr(Collection, "get_visibility_display")
 Collection.artifacts = artifacts
 Collection.stage = cache
diff --git a/lamindb/_feature_set.py b/lamindb/_feature_set.py
@@ -64,7 +64,7 @@ def __init__(self, *args, **kwargs):
     dtype: str | None = kwargs.pop("dtype") if "dtype" in kwargs else None
     name: str | None = kwargs.pop("name") if "name" in kwargs else None
     if len(kwargs) > 0:
-        raise ValueError("Only features, type, name are valid keyword arguments")
+        raise ValueError("Only features, dtype, name are valid keyword arguments")
     # now code
     features_registry = validate_features(features)
     if dtype is None:

diff --git a/lamindb/_is_versioned.py b/lamindb/_is_versioned.py
@@ -11,7 +11,7 @@
 
 
 # docstring handled through attach_func_to_class_method
-def add_to_version_family(
+def _add_to_version_family(
     self, is_new_version_of: IsVersioned, version: str | None = None
 ):
     old_uid = self.uid
@@ -30,7 +30,7 @@ def add_to_version_family(
 
 
 METHOD_NAMES = [
-    "add_to_version_family",
+    "_add_to_version_family",
 ]
 
 if ln_setup._TESTING:  # type: ignore

diff --git a/lamindb/_registry.py b/lamindb/_registry.py
@@ -87,12 +87,12 @@ def __init__(orm: Registry, *args, **kwargs):
             if match:
                 if "version" in kwargs:
                     version_comment = " and version"
-                    existing_record = orm.filter(
+                    existing_record = orm.__class__.filter(
                         name=kwargs["name"], version=kwargs["version"]
                     ).one_or_none()
                 else:
                     version_comment = ""
-                    existing_record = orm.filter(name=kwargs["name"]).one()
+                    existing_record = orm.__class__.filter(name=kwargs["name"]).one()
                 if existing_record is not None:
                     logger.important(
                         f"returning existing {orm.__class__.__name__} record with same"

diff --git a/lamindb/core/_feature_manager.py b/lamindb/core/_feature_manager.py
@@ -118,7 +118,9 @@ def get_feature_set_links(host: Artifact | Collection) -> QuerySet:
 
 def get_link_attr(link: LinkORM | type[LinkORM], data: HasFeatures) -> str:
     link_model_name = link.__class__.__name__
-    if link_model_name == "ModelBase":  # we passed the type of the link
+    if (
+        link_model_name == "ModelBase" or link_model_name == "RegistryMeta"
+    ):  # we passed the type of the link
         link_model_name = link.__name__
     link_attr = link_model_name.replace(data.__class__.__name__, "").lower()
     return link_attr
@@ -592,7 +594,7 @@ def _add_values(
         links = [
             LinkORM(
                 **{
-                    f"{self._host.__get_name_with_schema__().lower()}_id": self._host.id,
+                    f"{self._host.__class__.__get_name_with_schema__().lower()}_id": self._host.id,
                     valuefield_id: feature_value.id,
                 }
             )

diff --git a/noxfile.py b/noxfile.py
@@ -106,6 +106,7 @@ def build(session, group):
 
     login_testuser2(session)
     login_testuser1(session)
+    run(session, "lamin set private-django-api true")
     coverage_args = "--cov=lamindb --cov-config=pyproject.toml --cov-append --cov-report=term-missing"
     if group == "unit":
         run(session, f"pytest {coverage_args} ./tests")

diff --git a/sub/lamin-cli b/sub/lamin-cli
diff --git a/sub/lamindb-setup b/sub/lamindb-setup
diff --git a/tests/test_versioning.py b/tests/test_versioning.py
@@ -40,7 +40,7 @@ def test_bump_version():
     assert bump_version(current_version_major_minor, bump_type="minor") == "2.2"
 
 
-def test_add_to_version_family(df1, df2):
+def test__add_to_version_family(df1, df2):
     artifact1 = ln.Artifact.from_df(df1, description="test1")
     artifact1.save()
     artifact2 = ln.Artifact.from_df(df2, description="test2")
@@ -49,7 +49,7 @@ def test_add_to_version_family(df1, df2):
         artifact1.uid[: artifact1._len_stem_uid]
         != artifact2.uid[: artifact2._len_stem_uid]
     )
-    artifact2.add_to_version_family(artifact1)
+    artifact2._add_to_version_family(artifact1)
     assert (
         artifact1.uid[: artifact1._len_stem_uid]
         == artifact2.uid[: artifact2._len_stem_uid]
+23 −0		.github/workflows/doc-changes.yml
+6 −1		lamin_cli/__main__.py
+8 −8		.github/workflows/build.yml
+83 −0		lamindb_setup/core/_private_django_api.py
+33 −4		lamindb_setup/core/_settings.py
+29 −2		lamindb_setup/core/hashing.py
+4 −1		noxfile.py
+0 −12		tests/hub-prod/test_auto_connect.py
+55 −0		tests/hub-prod/test_global_settings.py