Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

✨ Integrate lamindb with git #1493

Merged
merged 26 commits into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions lamindb/_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,15 @@ def __init__(transform: Transform, *args, **kwargs):
reference: Optional[str] = (
kwargs.pop("reference") if "reference" in kwargs else None
)
reference_type: Optional[str] = (
kwargs.pop("reference_type") if "reference_type" in kwargs else None
)
# below is internal use that we'll hopefully be able to eliminate
uid: Optional[str] = kwargs.pop("uid") if "uid" in kwargs else None
if not len(kwargs) == 0:
raise ValueError(
"Only name, short_name, version, type, is_new_version_of can be passed,"
f" but you passed: {kwargs}"
"Only name, short_name, version, type, is_new_version_of, reference, "
f"reference_type can be passed, but you passed: {kwargs}"
)
if is_new_version_of is None:
new_uid = init_uid(version=version, n_full_id=Transform._len_full_uid)
Expand All @@ -56,6 +59,7 @@ def __init__(transform: Transform, *args, **kwargs):
type=type,
version=version,
reference=reference,
reference_type=reference_type,
_has_consciously_provided_uid=has_consciously_provided_uid,
)

Expand Down
69 changes: 45 additions & 24 deletions lamindb/core/_run_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,25 @@
import hashlib
import os
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path, PurePath
from typing import Any, Dict, List, Optional, Tuple, Union

from lamin_utils import logger
from lamindb_setup import settings
from lamindb_setup import settings as setup_settings
from lamindb_setup.core import InstanceSettings
from lamindb_setup.core.types import UPathStr
from lnschema_core import Run, Transform, ids
from lnschema_core.types import TransformType
from lnschema_core.users import current_user_id

from lamindb.core._transform_settings import transform_settings

from ._settings import settings
from ._sync_git import get_transform_reference_from_git_repo

is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)

msg_path_failed = (
Expand Down Expand Up @@ -257,7 +262,7 @@ def _track(
>>> transform = ln.Transform.filter(name="Cell Ranger", version="2").one()
>>> ln.track(transform)
"""
cls.instance = settings.instance
cls.instance = setup_settings.instance
if transform is None:
is_tracked = False
transform_settings_are_set = (
Expand All @@ -274,23 +279,23 @@ def _track(
).one_or_none()
if is_run_from_ipython:
short_name, name, _ = cls._track_notebook(path=path)
transform_type = TransformType.notebook
transform_ref = None
transform_ref_type = None
else:
import inspect

frame = inspect.stack()[1]
module = inspect.getmodule(frame[0])
name = Path(module.__file__).name # type: ignore
short_name = name
transform_type = (
TransformType.notebook
if is_run_from_ipython
else TransformType.script
)
(
name,
short_name,
transform_ref,
transform_ref_type,
) = cls._track_script(path=path)
transform_type = TransformType.script
cls._create_or_load_transform(
stem_uid=stem_uid,
version=version,
name=name,
reference=reference,
transform_ref=transform_ref,
transform_ref_type=transform_ref_type,
transform_type=transform_type,
short_name=short_name,
transform=transform,
Expand Down Expand Up @@ -347,14 +352,28 @@ def _track(

track_environment(run)

# at this point, we have a transform can display its parents if there are any
parents = cls.transform.parents.all() if cls.transform is not None else []
if len(parents) > 0:
if len(parents) == 1:
logger.info(f" parent transform: {parents[0]}")
else:
parents_formatted = "\n - ".join([f"{parent}" for parent in parents])
logger.info(f" parent transforms:\n - {parents_formatted}")
return None

@classmethod
def _track_script(
    cls,
    *,
    path: Optional[UPathStr],
) -> Tuple[str, str, Optional[str], Optional[str]]:
    """Derive transform metadata for a tracked Python script.

    Args:
        path: Path of the script. If `None`, the path is taken from the
            module that owns the frame two levels up the call stack.

    Returns:
        A tuple `(name, short_name, reference, reference_type)`. `name` and
        `short_name` are both the file name of the script. `reference` and
        `reference_type` are `None` unless `settings.sync_git_repo` is set,
        in which case `reference` is the value returned by
        `get_transform_reference_from_git_repo` and `reference_type` is
        `"url"`.
    """
    if path is None:
        import inspect

        # index 0 is this frame, index 1 the direct caller; index 2 is
        # presumably the user's script when called via `_track` -- the index
        # depends on the call depth, so keep it in sync with the callers
        frame = inspect.stack()[2]
        module = inspect.getmodule(frame[0])
        path = Path(module.__file__)  # type: ignore
    elif isinstance(path, str):
        # a plain string has no `.name` attribute, normalize it first
        path = Path(path)
    name = path.name
    short_name = name
    reference = None
    reference_type = None
    if settings.sync_git_repo is not None:
        reference = get_transform_reference_from_git_repo(path)
        reference_type = "url"
    return name, short_name, reference, reference_type

@classmethod
def _track_notebook(
Expand Down Expand Up @@ -416,7 +435,8 @@ def _create_or_load_transform(
stem_uid: str,
version: Optional[str],
name: str,
reference: Optional[str] = None,
transform_ref: Optional[str] = None,
transform_ref_type: Optional[str] = None,
short_name: Optional[str] = None,
transform_type: TransformType = None,
transform: Optional[Transform] = None,
Expand All @@ -429,7 +449,8 @@ def _create_or_load_transform(
version=version,
name=name,
short_name=short_name,
reference=reference,
reference=transform_ref,
reference_type=transform_ref_type,
type=transform_type,
)
transform.save()
Expand Down
27 changes: 24 additions & 3 deletions lamindb/core/_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,20 @@
)


def sanitize_git_repo_url(repo_url: str) -> str:
    """Return `repo_url` without a trailing `.git` suffix.

    Only the suffix is removed: a `.git` that occurs elsewhere in the URL
    (for instance in a repository named `user.github.io`) is left untouched.
    """
    suffix = ".git"
    if repo_url.endswith(suffix):
        return repo_url[: -len(suffix)]
    return repo_url


class Settings:
"""Settings.

Directly use instance `lamindb.settings` rather than instantiating this
class yourself.
Use ``lamindb.settings`` instead of instantiating this class yourself.
"""

def __init__(self):
self._verbosity_int: int = 1 # success-level logging
self._verbosity_int: int = 1 # warning-level logging
logger.set_verbosity(self._verbosity_int)
self._sync_git_repo: Optional[str] = None

upon_artifact_create_if_hash_exists: Literal[
"warn_return_existing", "error", "warn_create_new"
Expand Down Expand Up @@ -89,6 +93,23 @@ def _storage_settings(self) -> ln_setup.dev.StorageSettings:
storage_settings = ln_setup.dev.StorageSettings(root=self._using_storage)
return storage_settings

@property
def sync_git_repo(self) -> Optional[str]:
    """Sync transforms with scripts in git repository.

    Provide the full git repo URL.
    """
    return self._sync_git_repo

@sync_git_repo.setter
def sync_git_repo(self, value: Optional[str]) -> None:
    """Sync transforms with scripts in git repository.

    Provide the full git repo URL, it has to start with `https://`.
    Pass `None` to disable syncing again.

    Raises:
        AssertionError: If the URL does not start with `https://`.
    """
    if value is None:
        self._sync_git_repo = None
        return
    repo_url = sanitize_git_repo_url(value)
    # validate before storing so that a rejected value never ends up in the
    # settings; an explicit raise (rather than `assert`) also holds under -O,
    # AssertionError is kept as the exception type for backward compatibility
    if not repo_url.startswith("https://"):
        raise AssertionError(
            f"sync_git_repo has to start with 'https://', got: {value!r}"
        )
    self._sync_git_repo = repo_url

@property
def storage(self) -> Union[Path, UPath]:
"""Default storage location (a path to its root).
Expand Down
108 changes: 108 additions & 0 deletions lamindb/core/_sync_git.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import subprocess
from pathlib import Path
from typing import Optional

from lamin_utils import logger
from lamindb_setup import settings as setup_settings
from lamindb_setup.core.hashing import hash_code

from ._settings import sanitize_git_repo_url, settings


def get_git_repo_from_remote() -> Path:
    """Return a local clone of the repository in `settings.sync_git_repo`.

    The clone lives in the storage cache directory under the last path
    segment of the repository URL. An existing directory of that name is
    reused as is, it is neither updated nor validated.

    Returns:
        Path to the local clone.

    Raises:
        RuntimeError: If git cannot be run, `git clone` fails, or the expected
            directory does not exist after cloning.
    """
    repo_url = settings.sync_git_repo
    cache_dir = setup_settings.storage.cache_dir
    repo_dir = cache_dir / repo_url.split("/")[-1]
    if repo_dir.exists():
        logger.warning(f"git repo {repo_dir} already exists locally")
        return repo_dir
    logger.important(f"cloning {repo_url} into {repo_dir}")
    # NOTE(review): the clone is shallow (--depth 10), so blobs that only occur
    # in older commits will presumably not be found in it -- confirm this is
    # the intended trade-off
    try:
        # an argument list instead of a shell string: the URL comes from a
        # user-provided setting and must not be interpreted by a shell
        result = subprocess.run(
            ["git", "clone", "--depth", "10", f"{repo_url}.git"],
            capture_output=True,
            cwd=cache_dir,
        )
    except FileNotFoundError as error:
        raise RuntimeError(f"could not run git clone: {error}") from error
    if result.returncode != 0 or not repo_dir.exists():
        raise RuntimeError(result.stderr.decode())
    return repo_dir


def check_remote_git_url_matches_setting():
    """Verify that the local `origin` remote is the configured repository.

    Reads `remote.origin.url` via `git config` in the current working
    directory, normalizes it with `sanitize_git_repo_url` and compares it with
    `settings.sync_git_repo`.

    Raises:
        AssertionError: If the two URLs differ, which includes the case that
            no `origin` URL could be read.
    """
    result = subprocess.run(
        "git config --get remote.origin.url",
        shell=True,
        capture_output=True,
    )
    remote_url = sanitize_git_repo_url(result.stdout.decode().strip())
    # an explicit raise instead of `assert` so that the check is not stripped
    # under `python -O`; the exception type is kept for backward compatibility
    if remote_url != settings.sync_git_repo:
        raise AssertionError(
            f"remote.origin.url of the local git repository ({remote_url!r}) "
            f"does not match settings.sync_git_repo ({settings.sync_git_repo!r})"
        )


def get_git_commit_hash(
    blob_hash: str, repo_dir: Optional[Path] = None
) -> Optional[str]:
    """Find a commit whose tree contains the blob `blob_hash`.

    Args:
        blob_hash: Git object id of the blob to look for.
        repo_dir: Directory inside the git repository to search. `None` uses
            the current working directory.

    Returns:
        The full 40-character commit hash, or `None` if git reports no commit
        for the blob or the command fails (for instance outside of a git
        repository).
    """
    result = subprocess.run(
        ["git", "log", f"--find-object={blob_hash}", "--pretty=format:%H"],
        capture_output=True,
        cwd=repo_dir,
    )
    if result.returncode != 0:
        return None
    commit_hashes = result.stdout.decode().split()
    if not commit_hashes:
        return None
    # `--find-object` lists every commit that adds or removes the blob, newest
    # first; the oldest one is the commit that introduced the blob and hence is
    # guaranteed to contain it, whereas a newer one may be the commit that
    # replaced it
    commit_hash = commit_hashes[-1]
    assert len(commit_hash) == 40, f"unexpected commit hash: {commit_hash!r}"
    return commit_hash


def get_filepath_within_git_repo(
    commit_hash: str, blob_hash: str, repo_dir: Optional[Path]
) -> str:
    """Return the path of blob `blob_hash` in the tree of `commit_hash`.

    Args:
        commit_hash: Commit whose tree is listed.
        blob_hash: Git object id of the blob to locate.
        repo_dir: Directory inside the git repository. `None` uses the current
            working directory.

    Returns:
        The path of the blob relative to the root of the repository. If the
        blob occurs under several paths, the last one listed is returned.

    Raises:
        RuntimeError: If `git ls-tree` fails or the tree does not contain the
            blob.
    """
    # repo_dir might not point to the root of the git repository because
    # git log --find-object works from anywhere in the repo, hence, let's get
    # the root
    repo_root = (
        subprocess.run(
            ["git", "rev-parse", "--show-toplevel"],
            capture_output=True,
            cwd=repo_dir,
        )
        .stdout.decode()
        .strip()
    )
    # -z separates entries by NUL and disables quoting of unusual file names
    ls_tree_args = ["git", "ls-tree", "-r", "-z", commit_hash]
    command = " ".join(ls_tree_args)
    result = subprocess.run(
        ls_tree_args,
        capture_output=True,
        # an empty root means that rev-parse failed, fall back to repo_dir
        # rather than passing an empty string as the working directory
        cwd=repo_root or repo_dir,
    )
    if result.returncode != 0:
        raise RuntimeError(f"{command}\n{result.stderr.decode()}")
    filepath = None
    for entry in result.stdout.decode().split("\0"):
        # each entry reads "<mode> <type> <object>\t<path>"; splitting at the
        # tab keeps paths that contain whitespace intact
        metadata, separator, entry_path = entry.partition("\t")
        if separator and metadata.split()[-1] == blob_hash:
            filepath = entry_path
    if filepath is None:
        raise RuntimeError(
            f"Could not find path in git repo {settings.sync_git_repo} running:\n{command}"
            f"\nin local clone: {repo_root}"
        )
    return filepath


def get_transform_reference_from_git_repo(path: Path) -> str:
    """Build the URL that points to the committed version of a script.

    The blob hash of `path` is looked up in the git repository of the current
    working directory first. If it is not found there, the repository in
    `settings.sync_git_repo` is obtained via `get_git_repo_from_remote` and
    searched instead.

    Returns:
        A string of the form `<sync_git_repo>/blob/<commit hash>/<path in repo>`.

    Raises:
        RuntimeError: If the blob is found in neither repository.
    """
    script_blob = hash_code(path).hexdigest()
    clone_dir = None
    found_commit = get_git_commit_hash(script_blob)
    if found_commit is None:
        # not part of the local repository, fall back to the remote one
        clone_dir = get_git_repo_from_remote()
        found_commit = get_git_commit_hash(script_blob, repo_dir=clone_dir)
        if found_commit is None:
            raise RuntimeError(
                f"Did not find blob hash {script_blob} of {path} in git repo {clone_dir}"
            )
    else:
        logger.warning("found script in local repository")
        check_remote_git_url_matches_setting()
    path_in_repo = get_filepath_within_git_repo(found_commit, script_blob, clone_dir)
    return f"{settings.sync_git_repo}/blob/{found_commit}/{path_in_repo}"
21 changes: 21 additions & 0 deletions tests/test_run_context.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import subprocess

import lamindb as ln
import pytest
from lamindb.core._run_context import get_uid_ext, run_context
Expand Down Expand Up @@ -85,3 +87,22 @@ def test_create_or_load_transform(monkeypatch):
"SystemExit: Please update your transform settings as follows"
in error.exconly()
)


def test_sync_git_repo():
    """Run a tracked script and check that its transform links to the git repo.

    The script is executed in a subprocess; afterwards the saved transform is
    expected to reference the script at a pinned commit of the lamin-cli
    repository.
    """
    import sys

    ln.setup.settings.auto_connect = False
    script_path = "sub/lamin-cli/tests/scripts/initialized.py"
    # use the interpreter that runs the tests: a bare `python` on PATH may be
    # missing or belong to a different environment
    result = subprocess.run(
        [sys.executable, script_path],
        capture_output=True,
    )
    stdout = result.stdout.decode()
    # surface the script's stderr if it failed
    assert result.returncode == 0, result.stderr.decode()
    assert "saved: Transform" in stdout
    assert "saved: Run" in stdout
    transform = ln.Transform.filter(name="initialized.py").one()
    assert (
        transform.reference
        == "https://github.com/laminlabs/lamin-cli/blob/39fb29b1b3ccc891a025b5a631d6294413b6ee45/tests/scripts/initialized.py"
    )
    assert transform.reference_type == "url"
4 changes: 2 additions & 2 deletions tests/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ def test_is_new_version_of_versioned_transform():
ln.Transform(x=1)
assert (
error.exconly()
== "ValueError: Only name, short_name, version, type, is_new_version_of can be"
" passed, but you passed: {'x': 1}"
== "ValueError: Only name, short_name, version, type, is_new_version_of,"
" reference, reference_type can be passed, but you passed: {'x': 1}"
)

# test that reference transform cannot be deleted
Expand Down
Loading