From 956d19549ee95cfe5087818aff9fd6a17846aff8 Mon Sep 17 00:00:00 2001 From: Alex Wolf Date: Tue, 12 Mar 2024 22:11:15 +0100 Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Refactor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lamindb/core/_run_context.py | 72 +-------------------------------- lamindb/core/_sync_git.py | 78 ++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 71 deletions(-) create mode 100644 lamindb/core/_sync_git.py diff --git a/lamindb/core/_run_context.py b/lamindb/core/_run_context.py index 308e34754..16d139ae7 100644 --- a/lamindb/core/_run_context.py +++ b/lamindb/core/_run_context.py @@ -11,7 +11,6 @@ from lamin_utils import logger from lamindb_setup import settings as setup_settings from lamindb_setup.core import InstanceSettings -from lamindb_setup.core.hashing import hash_code from lamindb_setup.core.types import UPathStr from lnschema_core import Run, Transform, ids from lnschema_core.types import TransformType @@ -20,6 +19,7 @@ from lamindb.core._transform_settings import transform_settings from ._settings import settings +from ._sync_git import get_transform_reference_from_git_repo is_run_from_ipython = getattr(builtins, "__IPYTHON__", False) @@ -204,76 +204,6 @@ def raise_transform_settings_error() -> None: ) -def clone_git_repo(git_url: str) -> None: - if not git_url.endswith(".git"): - git_url += ".git" - logger.important(f"cloning {git_url}") - result = subprocess.run( - f"git clone --depth 10 {git_url}", - shell=True, - capture_output=True, - ) - if result.returncode != 0: - raise RuntimeError(result.stderr.decode()) - - -def dir_from_repo_url(repo_url: Optional[str]) -> Optional[str]: - if repo_url is not None: - cd_repo = repo_url.split("/")[-1].replace(".git", "") - return cd_repo - - -def get_git_commit_hash( - blob_hash: str, cd_repo: Optional[str] -) -> subprocess.CompletedProcess: - return subprocess.run( - f"git log --find-object={blob_hash} --pretty=format:%H", - shell=True, - capture_output=True, - cwd=cd_repo, - ) - - -def get_filepath_within_git_repo( - commit_hash: str, blob_hash: str, cd_repo: Optional[str] -) -> str: - result = subprocess.run( - f"git ls-tree -r {commit_hash} | grep -E {blob_hash}", - shell=True, - capture_output=True, - cwd=cd_repo, - ) - if result.returncode != 0: - raise RuntimeError( - f"git ls-tree -r {commit_hash} | grep -E {blob_hash}\n" - + result.stderr.decode() - ) - if len(result.stdout.decode()) == 0: - raise RuntimeError("Could not find filepath within git repo.") - filepath = result.stdout.decode().split()[-1] - return filepath - - -def get_transform_reference_from_git_repo(path: Path): - blob_hash = hash_code(path).hexdigest() - cd_repo = None - result = get_git_commit_hash(blob_hash, cd_repo=None) - commit_hash = result.stdout.decode() - if commit_hash == "" or result.returncode == 1: - cd_repo = dir_from_repo_url(settings.sync_git_repo) - clone_git_repo(settings.sync_git_repo) - result = get_git_commit_hash(blob_hash, cd_repo=cd_repo) - commit_hash = result.stdout.decode() - if commit_hash == "" or result.returncode == 1: - raise RuntimeError( - f"Did not find file in git repo\n{result.stderr.decode()}" - ) - gitpath = get_filepath_within_git_repo(commit_hash, blob_hash, cd_repo) - reference = f"{settings.sync_git_repo}/blob/{commit_hash}/{gitpath}" - reference_type = "url" - return reference, reference_type - - class run_context: """Global run context.""" diff --git a/lamindb/core/_sync_git.py b/lamindb/core/_sync_git.py new file mode 100644 index 000000000..42c7fa209 --- /dev/null +++ b/lamindb/core/_sync_git.py @@ -0,0 +1,78 @@ +import subprocess +from pathlib import Path +from typing import Optional + +from lamin_utils import logger +from lamindb_setup.core.hashing import hash_code + +from ._settings import settings + + +def clone_git_repo(git_url: str) -> None: + if not git_url.endswith(".git"): + git_url += ".git" + logger.important(f"cloning {git_url}") + result = subprocess.run( + f"git clone --depth 10 {git_url}", + shell=True, + capture_output=True, + ) + if result.returncode != 0: + raise RuntimeError(result.stderr.decode()) + + +def dir_from_repo_url(repo_url: Optional[str]) -> Optional[str]: + if repo_url is not None: + cd_repo = repo_url.split("/")[-1].replace(".git", "") + return cd_repo + + +def get_git_commit_hash( + blob_hash: str, cd_repo: Optional[str] +) -> subprocess.CompletedProcess: + return subprocess.run( + f"git log --find-object={blob_hash} --pretty=format:%H", + shell=True, + capture_output=True, + cwd=cd_repo, + ) + + +def get_filepath_within_git_repo( + commit_hash: str, blob_hash: str, cd_repo: Optional[str] +) -> str: + result = subprocess.run( + f"git ls-tree -r {commit_hash} | grep -E {blob_hash}", + shell=True, + capture_output=True, + cwd=cd_repo, + ) + if result.returncode != 0: + raise RuntimeError( + f"git ls-tree -r {commit_hash} | grep -E {blob_hash}\n" + + result.stderr.decode() + ) + if len(result.stdout.decode()) == 0: + raise RuntimeError("Could not find filepath within git repo.") + filepath = result.stdout.decode().split()[-1] + return filepath + + +def get_transform_reference_from_git_repo(path: Path): + blob_hash = hash_code(path).hexdigest() + cd_repo = None + result = get_git_commit_hash(blob_hash, cd_repo=None) + commit_hash = result.stdout.decode() + if commit_hash == "" or result.returncode == 1: + cd_repo = dir_from_repo_url(settings.sync_git_repo) + clone_git_repo(settings.sync_git_repo) + result = get_git_commit_hash(blob_hash, cd_repo=cd_repo) + commit_hash = result.stdout.decode() + if commit_hash == "" or result.returncode == 1: + raise RuntimeError( + f"Did not find file in git repo\n{result.stderr.decode()}" + ) + gitpath = get_filepath_within_git_repo(commit_hash, blob_hash, cd_repo) + reference = f"{settings.sync_git_repo}/blob/{commit_hash}/{gitpath}" + reference_type = "url" + return reference, reference_type