Merge branch 'main' into mudata-type-checking

laminlabs · Jul 17, 2024 · 4283a54 · 4283a54
2 parents 7aa1ac5 + 30c3ead
commit 4283a54
Show file tree

Hide file tree

Showing 22 changed files with 107 additions and 58 deletions.
diff --git a/lamindb/_annotate.py b/lamindb/_annotate.py
@@ -1085,7 +1085,7 @@ def save_artifact(
 
     artifact = None
     if data_is_anndata(data):
-        assert adata is not None
+        assert adata is not None  # noqa: S101
         artifact = Artifact.from_anndata(data, description=description, **kwargs)
         artifact.n_observations = adata.shape[0]
         data = adata
@@ -1306,7 +1306,7 @@ def log_saved_labels(
 def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> None:
     """Save a parent label for the given labels."""
     registry = field.field.model
-    assert registry == ULabel
+    assert registry == ULabel  # noqa: S101
     all_records = registry.from_values(values, field=field)
     is_feature = registry.filter(name=f"is_{key}").one_or_none()
     if is_feature is None:

diff --git a/lamindb/_artifact.py b/lamindb/_artifact.py
@@ -424,7 +424,7 @@ def log_storage_hint(
     logger.hint(hint)
 
 
-def data_is_anndata(data: AnnData | UPathStr):
+def data_is_anndata(data: AnnData | UPathStr) -> bool:
     if isinstance(data, AnnData):
         return True
     if isinstance(data, (str, Path, UPath)):
@@ -444,7 +444,7 @@ def data_is_anndata(data: AnnData | UPathStr):
     return False
 
 
-def data_is_mudata(data: MuData | UPathStr):
+def data_is_mudata(data: MuData | UPathStr) -> bool:
     if _mudata_is_installed():
         from mudata import MuData
 
@@ -948,7 +948,7 @@ def delete(
         )
         delete_record = response == "y"
     else:
-        assert permanent
+        assert permanent  # noqa: S101
         delete_record = True
 
     if delete_record:

diff --git a/lamindb/_can_validate.py b/lamindb/_can_validate.py
@@ -223,8 +223,10 @@ def set_abbr(self, value: str):
     else:
         try:
             self.add_synonym(value, save=False)
-        except Exception:  # pragma: no cover
-            pass
+        except Exception as e:  # pragma: no cover
+            logger.debug(
+                f"Encountered an Exception while attempting to add synonyms.\n{e}"
+            )
 
     if not self._state.adding:
         self.save()

diff --git a/lamindb/_collection.py b/lamindb/_collection.py
@@ -100,7 +100,7 @@ def __init__(
     else:
         if not hasattr(artifacts, "__getitem__"):
             raise ValueError("Artifact or List[Artifact] is allowed.")
-        assert isinstance(artifacts[0], Artifact)  # type: ignore
+        assert isinstance(artifacts[0], Artifact)  # type: ignore  # noqa: S101
     hash, feature_sets = from_artifacts(artifacts)  # type: ignore
     if meta is not None:
         if not isinstance(meta, Artifact):

diff --git a/lamindb/_feature.py b/lamindb/_feature.py
@@ -139,7 +139,7 @@ def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordsLis
     finally:
         settings.verbosity = verbosity
 
-    assert len(features) == len(df.columns)
+    assert len(features) == len(df.columns)  # noqa: S101
 
     # if len(categoricals_with_unmapped_categories) > 0:
     #     n_max = 20

diff --git a/lamindb/_finish.py b/lamindb/_finish.py
@@ -41,7 +41,10 @@ def finish() -> None:
     if run_context.run is None:
         raise TrackNotCalled("Please run `ln.track()` before `ln.finish()`")
     if run_context.path is None:
-        assert run_context.transform.type not in {"script", "notebook"}
+        if run_context.transform.type in {"script", "notebook"}:
+            raise ValueError(
+                f"Transform type must be 'script' or 'notebook' but is {run_context.transform.type}."
+            )
         run_context.run.finished_at = datetime.now(timezone.utc)
         run_context.run.save()
         # nothing else to do
@@ -107,9 +110,14 @@ def save_run_context_core(
         # convert the notebook file to html
         # log_level is set to 40 to silence the nbconvert logging
         subprocess.run(
-            "jupyter nbconvert --to html"
-            f" '{filepath.as_posix()}' --Application.log_level=40",
-            shell=True,
+            [
+                "jupyter",
+                "nbconvert",
+                "--to",
+                "html",
+                filepath.as_posix(),
+                "--Application.log_level=40",
+            ],
             check=True,
         )
         # move the temporary file into the cache dir in case it's accidentally
@@ -129,8 +137,12 @@ def save_run_context_core(
         source_code_path = ln_setup.settings.storage.cache_dir / filepath.name
         shutil.copy2(filepath, source_code_path)  # copy
         subprocess.run(
-            f"nbstripout '{source_code_path}' --extra-keys='metadata.version metadata.kernelspec metadata.language_info metadata.pygments_lexer metadata.name metadata.file_extension'",
-            shell=True,
+            [
+                "nbstripout",
+                source_code_path,
+                "--extra-keys",
+                "metadata.version metadata.kernelspec metadata.language_info metadata.pygments_lexer metadata.name metadata.file_extension",
+            ],
             check=True,
         )
     # find initial versions of source codes and html reports

diff --git a/lamindb/core/_feature_manager.py b/lamindb/core/_feature_manager.py
@@ -432,7 +432,7 @@ def _add_values(
     if isinstance(keys, DICT_KEYS_TYPE):
         keys = list(keys)  # type: ignore
     # deal with other cases later
-    assert all(isinstance(key, str) for key in keys)
+    assert all(isinstance(key, str) for key in keys)  # noqa: S101
     registry = feature_param_field.field.model
     is_param = registry == Param
     model = Param if is_param else Feature
@@ -668,10 +668,10 @@ def _add_set_from_df(
 ):
     """Add feature set corresponding to column names of DataFrame."""
     if isinstance(self._host, Artifact):
-        assert self._host.accessor == "DataFrame"
+        assert self._host.accessor == "DataFrame"  # noqa: S101
     else:
         # Collection
-        assert self._host.artifact.accessor == "DataFrame"
+        assert self._host.artifact.accessor == "DataFrame"  # noqa: S101
 
     # parse and register features
     registry = field.field.model
@@ -699,7 +699,7 @@ def _add_set_from_anndata(
 ):
     """Add features from AnnData."""
     if isinstance(self._host, Artifact):
-        assert self._host.accessor == "AnnData"
+        assert self._host.accessor == "AnnData"  # noqa: S101
     else:
         raise NotImplementedError()
 
@@ -729,7 +729,7 @@ def _add_set_from_mudata(
     if obs_fields is None:
         obs_fields = {}
     if isinstance(self._host, Artifact):
-        assert self._host.accessor == "MuData"
+        assert self._host.accessor == "MuData"  # noqa: S101
     else:
         raise NotImplementedError()
 

diff --git a/lamindb/core/_label_manager.py b/lamindb/core/_label_manager.py
@@ -66,7 +66,7 @@ def print_labels(self: HasFeatures, field: str = "name", print_types: bool = Fal
                 print_values = _print_values(labels_list, n=10)
                 type_str = f": {related_model}" if print_types else ""
                 labels_msg += f"    .{related_name}{type_str} = {print_values}\n"
-        except Exception:
+        except Exception:  # noqa: S112
             continue
     msg = ""
     if labels_msg:
@@ -102,7 +102,7 @@ def validate_labels_registry(
                 records = registry.from_values(label_uids, field=field)
                 if len(records) > 0:
                     save(records, parents=parents)
-            except Exception:
+            except Exception:  # noqa S110
                 pass
             field = "uid"
             label_uids = np.array(

diff --git a/lamindb/core/_mapped_collection.py b/lamindb/core/_mapped_collection.py
@@ -107,7 +107,10 @@ def __init__(
         parallel: bool = False,
         dtype: str | None = None,
     ):
-        assert join in {None, "inner", "outer"}
+        if join not in {None, "inner", "outer"}:  # pragma: nocover
+            raise ValueError(
+                f"join must be one of None, 'inner, or 'outer' but was {type(join)}"
+            )
 
         if layers_keys is None:
             self.layers_keys = ["X"]

diff --git a/lamindb/core/_run_context.py b/lamindb/core/_run_context.py
@@ -43,7 +43,7 @@ def get_uid_ext(version: str) -> str:
     # merely zero-padding the nbproject version such that the base62 encoding is
     # at least 4 characters long doesn't yields sufficiently diverse hashes and
     # leads to collisions; it'd be nice because the uid_ext would be ordered
-    return encodebytes(hashlib.md5(version.encode()).digest())[:4]
+    return encodebytes(hashlib.md5(version.encode()).digest())[:4]  # noqa: S324
 
 
 def update_stem_uid_or_version(
@@ -113,7 +113,7 @@ def get_notebook_name_colab() -> str:
 
     ip = gethostbyname(gethostname())  # 172.28.0.12
     try:
-        name = get(f"http://{ip}:9000/api/sessions").json()[0]["name"]
+        name = get(f"http://{ip}:9000/api/sessions").json()[0]["name"]  # noqa: S113
     except Exception:
         logger.warning(
             "could not get notebook name from Google Colab, using: notebook.ipynb"

diff --git a/lamindb/core/_settings.py b/lamindb/core/_settings.py
@@ -108,7 +108,8 @@ def sync_git_repo(self, value) -> None:
         For example: `ln.sync_git_repo = https://github.com/laminlabs/redun-lamin`
         """
         self._sync_git_repo = sanitize_git_repo_url(value)
-        assert self._sync_git_repo.startswith("https://")
+        if not self._sync_git_repo.startswith("https://"):  # pragma: nocover
+            raise ValueError("git repository URL must start with 'https://'.")
 
     @property
     def storage(self) -> StorageSettings:

diff --git a/lamindb/core/_sync_git.py b/lamindb/core/_sync_git.py
@@ -24,8 +24,7 @@ def get_git_repo_from_remote() -> Path:
         f"running outside of synched git repo, cloning {repo_url} into {repo_dir}"
     )
     result = subprocess.run(
-        f"git clone --depth 10 {repo_url}.git",
-        shell=True,
+        ["git", "clone", "--depth", "10", f"{repo_url}.git"],
         capture_output=True,
         cwd=setup_settings.storage.cache_dir,
     )
@@ -36,8 +35,7 @@ def get_git_repo_from_remote() -> Path:
 
 def check_local_git_repo() -> bool:
     result = subprocess.run(
-        "git config --get remote.origin.url",
-        shell=True,
+        ["git", "config", "--get remote.origin.url"],
         capture_output=True,
     )
     result_str = result.stdout.decode().strip()
@@ -55,10 +53,9 @@ def check_local_git_repo() -> bool:
 
 
 def get_git_commit_hash(blob_hash: str, repo_dir: Path | None = None) -> str | None:
-    command = f"git log --find-object={blob_hash} --pretty=format:%H"
+    command = ["git", "log", f"--find-object={blob_hash}", "--pretty=format:%H"]
     result = subprocess.run(
         command,
-        shell=True,
         capture_output=True,
         cwd=repo_dir,
     )
@@ -68,9 +65,6 @@ def get_git_commit_hash(blob_hash: str, repo_dir: Path | None = None) -> str | N
     if commit_hash == "" or result.returncode == 1:
         return None
     else:
-        assert (
-            len(commit_hash) == 40
-        ), f"commit hash |{commit_hash}| is not 40 characters long"
         return commit_hash
 
 
@@ -82,21 +76,34 @@ def get_filepath_within_git_repo(
     # from anywhere in the repo, hence, let's get the root
     repo_root = (
         subprocess.run(
-            "git rev-parse --show-toplevel",
-            shell=True,
+            ["git", "rev-parse", "--show-toplevel"],
             capture_output=True,
             cwd=repo_dir,
         )
         .stdout.decode()
         .strip()
     )
-    command = f"git ls-tree -r {commit_hash} | grep -E {blob_hash}"
+    # Run the git commands separately to circumvent spawning a shell
+    git_command = ["git", "ls-tree", "-r", commit_hash]
+    git_process = subprocess.Popen(
+        git_command,
+        stdout=subprocess.PIPE,
+        cwd=repo_root,
+    )
+
+    grep_command = ["grep", "-E", blob_hash]
     result = subprocess.run(
-        command,
-        shell=True,
+        grep_command,
+        stdin=git_process.stdout,
         capture_output=True,
         cwd=repo_root,
     )
+
+    # Close the stdout to allow git_process to receive a SIGPIPE if grep_command exits
+    git_process.stdout.close()
+    git_process.wait()
+
+    command = " ".join(git_command) + " | " + " ".join(grep_command)
     if result.returncode != 0 and result.stderr.decode() != "":
         raise RuntimeError(f"{command}\n{result.stderr.decode()}")
     if len(result.stdout.decode()) == 0:

diff --git a/lamindb/core/_track_environment.py b/lamindb/core/_track_environment.py
@@ -15,7 +15,11 @@ def track_environment(run: Run) -> None:
     # create a requirements.txt
     # we don't create a conda environment.yml mostly for its slowness
     try:
-        result = subprocess.run(f"pip freeze > {str(filepath)}", shell=True)
+        with open(filepath, "w") as f:
+            result = subprocess.run(
+                ["pip", "freeze"],
+                stdout=f,
+            )
     except OSError as e:
         result = None
         logger.warning(f"could not run pip freeze with error {e}")

diff --git a/lamindb/core/storage/_backed_access.py b/lamindb/core/storage/_backed_access.py
@@ -209,8 +209,10 @@ def safer_read_partial(elem, indices):
             try:
                 ds = CSRDataset(elem)
                 result = _subset_sparse(ds, indices)
-            except Exception:
-                pass
+            except Exception as e:
+                logger.debug(
+                    f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}"
+                )
         if result is None:
             raise ValueError(
                 "Can not get a subset of the element of type"
@@ -307,8 +309,10 @@ def safer_read_partial(elem, indices):  # noqa
                 try:
                     ds = CSRDataset(elem)
                     return _subset_sparse(ds, indices)
-                except Exception:
-                    pass
+                except Exception as e:
+                    logger.debug(
+                        f"Encountered an exception while attempting to subset a sparse dataset by indices.\n{e}"
+                    )
             raise ValueError(
                 "Can not get a subset of the element of type"
                 f" {type(elem).__name__} with an empty spec."

diff --git a/lamindb/core/storage/paths.py b/lamindb/core/storage/paths.py
@@ -47,7 +47,7 @@ def auto_storage_key_from_artifact(artifact: Artifact):
 
 
 def auto_storage_key_from_artifact_uid(uid: str, suffix: str, is_dir: bool) -> str:
-    assert isinstance(suffix, str)
+    assert isinstance(suffix, str)  # noqa: S101 Suffix cannot be None.
     if is_dir:
         uid_storage = uid[:16]  # 16 chars, leave 4 chars for versioning
     else:

diff --git a/lamindb/integrations/_vitessce.py b/lamindb/integrations/_vitessce.py
@@ -39,9 +39,12 @@ def save_vitessce_config(vitessce_config: VitessceConfig, description: str) -> A
             if "url" not in file:
                 raise ValueError("Each file must have a 'url' key.")
             filename = file["url"].split("/")[-1]
-            assert filename.endswith(
+            if not filename.endswith(
                 (".anndata.zarr", ".spatialdata.zarr", ".ome.zarr")
-            )
+            ):
+                raise ValueError(
+                    "filename must end with '.anndata.zarr', '.spatialdata.zarr', or '.ome.zarr'."
+                )
             filestem = (
                 filename.replace(".anndata.zarr", "")
                 .replace(".spatialdata.zarr", "")