From aa79efd411046906f14b2a2b0d95cdb279afe4d9 Mon Sep 17 00:00:00 2001 From: Joao Da Silva Date: Tue, 12 Mar 2024 23:44:43 +0100 Subject: [PATCH] feat: improve docker service --- poetry.lock | 80 ++++++- pyproject.toml | 1 + src/wanna/core/services/docker.py | 380 +++++++++++++++++++++--------- src/wanna/core/utils/env.py | 10 +- src/wanna/core/utils/gcp.py | 19 +- src/wanna/core/utils/io.py | 33 +++ 6 files changed, 384 insertions(+), 139 deletions(-) create mode 100644 src/wanna/core/utils/io.py diff --git a/poetry.lock b/poetry.lock index 4fa80eb..7b2d917 100644 --- a/poetry.lock +++ b/poetry.lock @@ -130,6 +130,17 @@ d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "bracex" +version = "2.4" +description = "Bash style brace expander." +optional = false +python-versions = ">=3.8" +files = [ + {file = "bracex-2.4-py3-none-any.whl", hash = "sha256:efdc71eff95eaff5e0f8cfebe7d01adf2c8637c8c92edaf63ef348c241a82418"}, + {file = "bracex-2.4.tar.gz", hash = "sha256:a27eaf1df42cf561fed58b7a8f3fdf129d1ea16a81e1fadd1d17989bc6384beb"}, +] + [[package]] name = "cachetools" version = "5.3.3" @@ -284,6 +295,24 @@ files = [ {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, ] +[[package]] +name = "cli-exit-tools" +version = "1.2.6" +description = "functions to exit an cli application properly" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "cli_exit_tools-1.2.6-py3-none-any.whl", hash = "sha256:b230a552266ec0e48a51da861fc03bd3a943458230551d8ab635256e797de746"}, + {file = "cli_exit_tools-1.2.6.tar.gz", hash = "sha256:e76d4b628f8a5bb6dbbfba8fb62d6124793960dba252e7e7a27897065fc487e1"}, +] + +[package.dependencies] +click = "*" +lib-detect-testenv = "*" + +[package.extras] +test = ["black", "codecov", "coloredlogs", "coverage", "flake8", "mypy", "pytest", "pytest-cov", "pytest-runner", "readme-renderer"] + [[package]] name = "click" version = "8.1.7" @@ -1230,6 +1259,27 @@ files = [ {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, ] +[[package]] +name = "igittigitt" +version = "2.1.4" +description = "A spec-compliant gitignore parser for Python" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "igittigitt-2.1.4-py3-none-any.whl", hash = "sha256:79c48999be429de95dd4829b312667a7150cbb02bdb1424dfe081316d737ec92"}, + {file = "igittigitt-2.1.4.tar.gz", hash = "sha256:9278d55de8852dca57fddb04a8a02edba6b4a9d9a5ac0fb6b2c32c919724807b"}, +] + +[package.dependencies] +attrs = "*" +cli-exit-tools = "*" +click = "*" +lib-detect-testenv = "*" +wcmatch = "*" + +[package.extras] +test = ["black", "codecov", "coloredlogs", "coverage", "flake8", "mypy", "pytest", "pytest-cov", "pytest-runner", "readme-renderer"] + [[package]] name = "importlib-metadata" version = "7.0.2" @@ -1373,6 +1423,20 @@ websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0" [package.extras] adal = ["adal (>=1.0.2)"] +[[package]] +name = "lib-detect-testenv" +version = "2.0.8" +description = "detects if pytest or doctest or pyrunner on pycharm is running" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "lib_detect_testenv-2.0.8-py3-none-any.whl", hash = "sha256:86a2555d5919ba11f50226e99852b29eb6dfeee37228af77ae80114186a165b2"}, + {file = "lib_detect_testenv-2.0.8.tar.gz", hash = "sha256:96527b3114727e70e80f671c204a225ae6aaaf117983f8fa4f56e542b2368d43"}, +] + +[package.extras] +test = ["black", "codecov", "coloredlogs", "coverage", "flake8", "mypy", "pytest", "pytest-cov", "pytest-runner", "readme-renderer"] + [[package]] name = "log-symbols" version = "0.0.14" @@ -3143,6 +3207,20 @@ files = [ [package.extras] watchmedo = ["PyYAML (>=3.10)"] +[[package]] +name = "wcmatch" +version = "8.5.1" +description = "Wildcard/glob file name matcher." +optional = false +python-versions = ">=3.8" +files = [ + {file = "wcmatch-8.5.1-py3-none-any.whl", hash = "sha256:24c19cedc92bc9c9e27f39db4e1824d72f95bd2cea32b254a47a45b1a1b227ed"}, + {file = "wcmatch-8.5.1.tar.gz", hash = "sha256:c0088c7f6426cf6bf27e530e2b7b734031905f7e490475fd83c7c5008ab581b3"}, +] + +[package.dependencies] +bracex = ">=2.1.1" + [[package]] name = "websocket-client" version = "1.7.0" @@ -3218,4 +3296,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.12.0" -content-hash = "c929100e228497bce7e7c70db86ed6cea59e9425fddea326e8ef291870ae4a4c" +content-hash = "3081742d2e4e2e79528fd04d6ac9af99203487c552fad712c715f21ad7b87a52" diff --git a/pyproject.toml b/pyproject.toml index 89305fd..865a1a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,7 @@ scandir = [ ] scantree = "0.0.1" pendulum = "^2.1.2" +igittigitt = "^2.1.4" [tool.poetry.group.samples.dependencies] google-cloud-pipeline-components = "^2.8.0" diff --git a/src/wanna/core/services/docker.py b/src/wanna/core/services/docker.py index 6b23d9a..db0e229 100644 --- a/src/wanna/core/services/docker.py +++ b/src/wanna/core/services/docker.py @@ -7,7 +7,6 @@ from dirhash import dirhash from google.api_core.client_options import ClientOptions from google.api_core.future.polling import DEFAULT_POLLING -from google.api_core.operation import Operation from google.cloud.devtools import cloudbuild_v1 from google.cloud.devtools.cloudbuild_v1.services.cloud_build import CloudBuildClient from google.cloud.devtools.cloudbuild_v1.types import ( @@ -17,6 +16,7 @@ Source, StorageSource, ) +from google.cloud.storage import Blob from google.protobuf.duration_pb2 import Duration # pylint: disable=no-name-in-module from python_on_whales import Image, docker @@ -37,9 +37,9 @@ from wanna.core.utils.env import cloud_build_access_allowed, gcp_access_allowed from wanna.core.utils.gcp import ( convert_project_id_to_project_number, - make_tarfile, upload_file_to_gcs, ) +from wanna.core.utils.io import tar_docker_context from wanna.core.utils.templates import render_template logger = get_logger(__name__) @@ -123,7 +123,7 @@ def _read_build_config( config_path: Returns: - DockerBuildConfigMode + """ if os.path.isfile(config_path): with open(config_path) as file: @@ -143,6 +143,31 @@ def _is_docker_client_active() -> bool: """ return docker.info().id is not None + @staticmethod + def _get_ignore_patterns(context_dir: Path) -> List[str]: + """ + Get the ignore patterns from .dockerignore file in the context_dir. + + Args: + context_dir: Path to the docker context directory where .dockerignore file is located + + Returns: + List of ignore patterns + + """ + docker_ignore = context_dir / ".dockerignore" + ignore = [] + + if docker_ignore.exists(): + with open(docker_ignore, "r") as f: + lines = f.readlines() + ignore += [ + ignore.rstrip() + for ignore in lines + if not ignore.startswith("#") and not ignore.strip() == "" + ] + return ignore + def _build_image( self, context_dir, @@ -150,56 +175,59 @@ def _build_image( tags: List[str], docker_image_ref: str, **build_args, - ) -> Union[Image, None]: - should_build = self._should_build_by_context_dir_checksum( - self.build_dir / docker_image_ref, context_dir + ) -> Optional[Image]: + """ + Build a docker image locally or in GCP Cloud Build. + + Args: + context_dir: Path to the directory with all necessary files for docker image build + file_path: Path to the Dockerfile + tags: List of tags for the image + docker_image_ref: Name of the image + build_args: Additional build arguments + + Returns: + Image object if the image was built locally, None otherwise + """ + + ignore_patterns = self._get_ignore_patterns(context_dir) + + # if the checksumdir of the context dir has changed, we should build the image + should_build = self.quick_mode or self._should_build_by_context_dir_checksum( + self.build_dir / docker_image_ref, context_dir, ignore_patterns ) - if should_build and not self.quick_mode: - if self.cloud_build: - logger.user_info( - text=f"Building {docker_image_ref} docker image in GCP Cloud build" - ) - op = self._build_image_on_gcp_cloud_build( - context_dir=context_dir, - file_path=file_path, - docker_image_ref=docker_image_ref, - tags=tags, - ) - build_id = op.metadata.build.id - base = "https://console.cloud.google.com/cloud-build/builds" - if self.cloud_build_workerpool: - link = ( - base - + f";region={self.cloud_build_workerpool_location}/{build_id}?project={self.project_number}" - ) - else: - link = base + f"/{build_id}?project={self.project_number}" - try: - op.result() - self._write_context_dir_checksum( - self.build_dir / docker_image_ref, context_dir - ) - except: - raise Exception(f"Build failed. Here is a link to the logs: {link}") - return None - else: - logger.user_info( - text=f"Building {docker_image_ref} docker image locally with {build_args}" - ) - image = docker.build( - context_dir, file=file_path, load=True, tags=tags, **build_args - ) - self._write_context_dir_checksum( - self.build_dir / docker_image_ref, context_dir - ) - return image # type: ignore - else: + # skip builds if we are in quick mode or the checksumdir of docker contex_dir has not changed + if not should_build: logger.user_info( text=f"Skipping build for context_dir={context_dir}, dockerfile={file_path} and image {tags[0]}" ) return None + if self.cloud_build: + logger.user_info( + text=f"Building {docker_image_ref} docker image in Cloud build" + ) + self._build_image_on_gcp_cloud_build( + context_dir=context_dir, + file_path=file_path, + docker_image_ref=docker_image_ref, + tags=tags, + ignore_patterns=ignore_patterns, + ) + return None + else: + logger.user_info( + text=f"Building {docker_image_ref} docker image locally with {build_args}" + ) + image = docker.build( + context_dir, file=file_path, load=True, tags=tags, **build_args + ) + self._write_context_dir_checksum( + self.build_dir / docker_image_ref, context_dir, ignore_patterns + ) + return image # type: ignore + def _pull_image(self, image_url: str) -> Union[Image, None]: if self.cloud_build or self.quick_mode: # TODO: verify that images exists remotely but dont pull them to local @@ -236,17 +264,21 @@ def get_image( ) -> Tuple[DockerImageModel, Optional[Image], str]: """ A wrapper around _get_image that checks if the docker image has been already build / pulled - and cached in image_store. + + Args: + docker_image_ref: Name of the image to get + + Returns: + Tuple of DockerImageModel, Image and image tag + """ - if docker_image_ref in self.image_store: - image = self.image_store.get(docker_image_ref) - return image # type: ignore - else: - image = self._get_image( - docker_image_ref=docker_image_ref, - ) + + image = self.image_store.get(docker_image_ref) + if not image: + image = self._get_image(docker_image_ref=docker_image_ref) self.image_store.update({docker_image_ref: image}) - return image + + return image def _get_image( self, @@ -256,8 +288,9 @@ def _get_image( Given the docker_image_ref, this function prepares the image for you. Depending on the build_type, it either build the docker image or if you work with provided_image type, it will pull the image to verify the url. + Args: - docker_image_ref: + docker_image_ref: Name of the image to get Returns: @@ -313,22 +346,31 @@ def _get_image( tags[0], ) - def _get_dirhash(self, directory: Path): - dockerignore = directory / ".dockerignore" - ignore = [] + @staticmethod + def _get_dirhash( + directory: Path, ignore_patterns: Optional[List[str]] = None + ) -> str: + """ + Get the checksum of the directory. - if dockerignore.exists(): - with open(dockerignore, "r") as f: - lines = f.readlines() - ignore += [ - ignore.rstrip() - for ignore in lines - if not ignore.startswith("#") and not ignore.strip() == "" - ] + Args: + directory: Path to the directory to be checksummed + ignore_patterns: List of patterns to ignore + + Returns: + Checksum of the directory + """ + return dirhash(directory, "sha256", ignore=set(ignore_patterns or [])) - return dirhash(directory, "sha256", ignore=set(ignore)) + def _get_cache_path(self, hash_cache_dir: Path) -> Path: + """ + Get the path to the cache file. - def _get_cache_path(self, hash_cache_dir: Path): + Args: + hash_cache_dir: Path to the directory where the cache file is stored + Returns: + Path to the cache file + """ version = kebabcase(self.version) os.makedirs(hash_cache_dir, exist_ok=True) return ( @@ -337,10 +379,21 @@ def _get_cache_path(self, hash_cache_dir: Path): ) def _should_build_by_context_dir_checksum( - self, hash_cache_dir: Path, context_dir: Path + self, hash_cache_dir: Path, context_dir: Path, ignore_patterns: List[str] ) -> bool: + """ + Check if the context_dir has changed since the last build. + + Args: + hash_cache_dir: Path to the directory where the cache file is stored + context_dir: Path to the directory to check + ignore_patterns: List of patterns to ignore + + Returns: + True if the context_dir has changed, False otherwise + """ cache_file = self._get_cache_path(hash_cache_dir) - sha256hash = self._get_dirhash(context_dir) + sha256hash = self._get_dirhash(context_dir, ignore_patterns) if cache_file.exists(): with open(cache_file, "r") as f: old_hash = f.read().replace("\n", "") @@ -348,15 +401,36 @@ def _should_build_by_context_dir_checksum( else: return True - def _write_context_dir_checksum(self, hash_cache_dir: Path, context_dir: Path): + def _write_context_dir_checksum( + self, + hash_cache_dir: Path, + context_dir: Path, + ignore_patterns: Optional[List[str]] = None, + ) -> None: + """ + Write the checksum of the context_dir to a file. + + Args: + hash_cache_dir: Path to the directory where the cache file is stored + context_dir: Path to the directory to check + ignore_patterns: List of patterns to ignore + Returns: + None + """ + cache_file = self._get_cache_path(hash_cache_dir) - sha256hash = self._get_dirhash(context_dir) + sha256hash = self._get_dirhash(context_dir, ignore_patterns) with open(cache_file, "w") as f: f.write(sha256hash) def _build_image_on_gcp_cloud_build( - self, context_dir: Path, file_path: Path, tags: List[str], docker_image_ref: str - ) -> Operation: + self, + context_dir: Path, + file_path: Path, + tags: List[str], + docker_image_ref: str, + ignore_patterns: Optional[List[str]] = None, + ) -> None: """ Build a docker container in GCP Cloud Build and push the images to registry. Folder context_dir is tarred, uploaded to GCS and then used to building. @@ -364,54 +438,56 @@ def _build_image_on_gcp_cloud_build( Args: context_dir: directory with all necessary files for docker image build file_path: path to Dockerfile - tags: + tags: list of tags for the image + docker_image_ref: Name of the image + ignore_patterns: List of patterns to ignore + + Returns: + None + """ + # Set the pooling timeout to self.cloud_build_timeout seconds + # since often large GPUs builds exceed the 900s limit + timeout = Duration() + timeout.seconds = self.cloud_build_timeout + DEFAULT_POLLING._timeout = self.cloud_build_timeout + dockerfile = os.path.relpath(file_path, context_dir) - tar_filename = self.work_dir / "build" / "docker" / f"{docker_image_ref}.tar.gz" - make_tarfile(context_dir, tar_filename) - blob_name = os.path.relpath(tar_filename, self.work_dir).replace("\\", "/") - blob = upload_file_to_gcs( - filename=tar_filename, bucket_name=self.bucket, blob_name=blob_name + blob = self._upload_context_dir_to_gcs( + context_dir, docker_image_ref, ignore_patterns ) - tags_args = " ".join([f"--destination={t}" for t in tags]).split() - steps = BuildStep( - name=f"gcr.io/kaniko-project/executor:{self.docker_model.cloud_build_kaniko_version}", - args=tags_args + tags_args = " ".join([f"--destination={t}" for t in tags]).split() + kaniko_build_args = ( + tags_args + self.docker_model.cloud_build_kaniko_flags - + ["--dockerfile", dockerfile], + + ["--dockerfile", dockerfile] ) - timeout = Duration() - timeout.seconds = self.cloud_build_timeout - - # Set the pooling timeout to self.cloud_build_timeout seconds - # since often large GPUs builds exceed the 900s limit - DEFAULT_POLLING._timeout = self.cloud_build_timeout - - if self.cloud_build_workerpool: - options = BuildOptions( - pool=BuildOptions.PoolOption( - name=f"projects/{self.project_number}/locations/{self.cloud_build_workerpool_location}" - f"/workerPools/{self.cloud_build_workerpool}" - ) - ) - api_endpoint = ( - f"{self.cloud_build_workerpool_location}-cloudbuild.googleapis.com" + options, api_endpoint = ( + (None, "cloudbuild.googleapis.com") + if not self.cloud_build_workerpool + else ( + BuildOptions( + pool=BuildOptions.PoolOption( + name=f"projects/{self.project_number}/locations/{self.cloud_build_workerpool_location}/workerPools/{self.cloud_build_workerpool}" + ) + ), + f"{self.cloud_build_workerpool_location}-cloudbuild.googleapis.com", ) - else: - options = None - api_endpoint = "cloudbuild.googleapis.com" + ) build = Build( source=Source( storage_source=StorageSource(bucket=blob.bucket.name, object_=blob.name) ), - steps=[steps], - # Issue with kaniko builder, images wont show in cloud build artifact column in UI - # https://github.com/GoogleCloudPlatform/cloud-builders-community/issues/212 - # images=tags, + steps=[ + BuildStep( + name=f"gcr.io/kaniko-project/executor:{self.docker_model.cloud_build_kaniko_version}", + args=kaniko_build_args, + ) + ], timeout=timeout, options=options, ) @@ -420,10 +496,27 @@ def _build_image_on_gcp_cloud_build( client_options=ClientOptions(api_endpoint=api_endpoint), ) request = cloudbuild_v1.CreateBuildRequest( - project_id=self.project_id, - build=build, + project_id=self.project_id, build=build + ) + + op = client.create_build(request=request) + build_id = op.metadata.build.id + base = "https://console.cloud.google.com/cloud-build/builds" + link = base + ( + f";region={self.cloud_build_workerpool_location}/{build_id}?project={self.project_number}" + if self.cloud_build_workerpool + else f"/{build_id}?project={self.project_number}" ) - return client.create_build(request=request) + + logger.user_info(text=f"Build started {link}") + + try: + op.result() + self._write_context_dir_checksum( + self.build_dir / docker_image_ref, context_dir, ignore_patterns + ) + except: + raise Exception(f"Build failed {link}") def push_image( self, image_or_tags: Union[Image, List[str]], quiet: bool = False @@ -431,10 +524,15 @@ def push_image( """ Push a docker image to the registry (image must have tags) If you are in the cloud_build mode, nothing is pushed, images already live in cloud. + Args: - :param image_or_tags: - :param quiet: + image_or_tags: what docker resource to push + quiet: If you don't want to see the progress bars. + + Returns: + None """ + if not self.cloud_build: tags = ( image_or_tags.repo_tags @@ -449,9 +547,13 @@ def push_image_ref( ) -> None: """ Push a docker image ref to the registry (image must have tags) + Args: image_ref: image_ref to push quiet: If you don't want to see the progress bars. + + Returns: + None """ model, image, tag = self.get_image(image_ref) if (image or tag) and model.build_type != ImageBuildType.provided_image: @@ -460,7 +562,16 @@ def push_image_ref( @staticmethod def remove_image(image: Image, force=False, prune=True) -> None: """ - Remove docker image, useful if you dont want to clutter your machine. + Remove docker image, useful if you don't want to clutter your machine. + + Args: + image: Image to remove + force: Force remove + prune: Prune + + Returns: + None + """ docker.image.remove(image, force=force, prune=prune) @@ -470,8 +581,9 @@ def construct_image_tag( ): """ Construct full image tag. + Args: - image_name: + image_name: name of the image Returns: List of full image tag @@ -486,6 +598,13 @@ def construct_image_tag( def build_container_and_get_image_url( self, docker_image_ref: str, push_mode: PushMode = PushMode.all ) -> str: + """ + Build a docker image and push it to the registry. + + :param docker_image_ref: + :param push_mode: + :return: + """ if push_mode == PushMode.quick: # only get image tag docker_image_model = self.find_image_model_by_name(docker_image_ref) @@ -527,7 +646,36 @@ def _jinja_render_dockerfile( ) else: raise Exception("Invalid docker image type.") + docker_file_path = build_dir / Path(f"{image_model.name}.Dockerfile") + with open(docker_file_path, "w") as file: file.write(rendered) + return docker_file_path + + def _upload_context_dir_to_gcs( + self, + context_dir: Path, + docker_image_ref: str, + ignore_patterns: Optional[List[str]] = None, + ) -> Blob: + """ + Tar the context_dir and upload it to GCS. + + Args: + context_dir: Path to the directory to be tarred and uploaded + docker_image_ref: Name of the image + ignore_patterns: List of patterns to ignore whilst tarring + + Returns: + Blob + """ + tar_filename = self.work_dir / "build" / "docker" / f"{docker_image_ref}.tar.gz" + tar_docker_context(context_dir, tar_filename, ignore_patterns or []) + + blob_name = os.path.relpath(tar_filename, self.work_dir).replace("\\", "/") + blob = upload_file_to_gcs( + filename=tar_filename, bucket_name=self.bucket, blob_name=blob_name + ) + return blob diff --git a/src/wanna/core/utils/env.py b/src/wanna/core/utils/env.py index fb92e22..8d08bd2 100644 --- a/src/wanna/core/utils/env.py +++ b/src/wanna/core/utils/env.py @@ -31,7 +31,7 @@ def _gcp_access_allowed(env_var="WANNA_GCP_ACCESS_ALLOWED"): bool if wanna can access GCP apis """ allowed = get_env_bool(os.environ.get(env_var), True) - logger.user_info(f"WANNA GCP access {'NOT ' if not allowed else ''}allowed") + logger.user_info(f"GCP access {'NOT ' if not allowed else ''}allowed") return allowed @@ -51,9 +51,9 @@ def _should_validate(env_var="WANNA_GCP_ENABLE_REMOTE_VALIDATION"): """ validate = get_env_bool(os.environ.get(env_var), True) if validate: - logger.user_info("WANNA remote validation is enabled") + logger.user_info("Remote validation is enabled") else: - logger.user_info("WANNA remote validation is disabled") + logger.user_info("Remote validation is disabled") return validate @@ -70,9 +70,9 @@ def _cloud_build_access_allowed(env_var="WANNA_GCP_CLOUD_BUILD_ACCESS_ALLOWED") """ allowed = get_env_bool(os.environ.get(env_var), True) if allowed: - logger.user_info("WANNA cloud build access is enabled") + logger.user_info("Cloud build access is enabled") else: - logger.user_info("WANNA cloud build access is disabled") + logger.user_info("Cloud build access is disabled") return allowed diff --git a/src/wanna/core/utils/gcp.py b/src/wanna/core/utils/gcp.py index 8e0b523..d2839e9 100644 --- a/src/wanna/core/utils/gcp.py +++ b/src/wanna/core/utils/gcp.py @@ -1,6 +1,4 @@ -import os import re -import tarfile from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -329,22 +327,9 @@ def construct_vm_image_family_from_vm_image( return f"{framework}-{version}-notebooks" -def make_tarfile(source_dir: Path, output_filename: Path): - """ - TAR a given folder and save the result to output_filename. - - Args: - source_dir: - output_filename: - """ - os.makedirs(output_filename.parent.absolute(), exist_ok=True) - with tarfile.open(output_filename, "w:gz") as tar: - tar.add(source_dir, arcname=".") - - def upload_file_to_gcs( filename: Path, bucket_name: str, blob_name: str -) -> "storage.blob.Blob": +) -> storage.blob.Blob: """ Upload file to GCS bucket @@ -365,7 +350,7 @@ def upload_file_to_gcs( def upload_string_to_gcs( data: str, bucket_name: str, blob_name: str -) -> "storage.blob.Blob": +) -> storage.blob.Blob: """ Upload a string to GCS bucket without saving it locally as a file. Args: diff --git a/src/wanna/core/utils/io.py b/src/wanna/core/utils/io.py new file mode 100644 index 0000000..77083a9 --- /dev/null +++ b/src/wanna/core/utils/io.py @@ -0,0 +1,33 @@ +import os +import tarfile +from pathlib import Path +from typing import List + +import igittigitt + + +def tar_docker_context( + source_dir: Path, target_tar_file: Path, ignore_patterns: List[str] = [] +): + """ + Tars a directory recursively while optionally skipping files based on ignore patterns. + + :param source_dir: Path to the directory to be tarred. + :param target_tar_file: Path to the output TAR file. + :param ignore_patterns: List of file patterns to skip (e.g., ['*.pyc', '*.log']). + """ + + parser = igittigitt.IgnoreParser() + for pattern in ignore_patterns: + parser.add_rule(pattern, source_dir) + + os.makedirs(target_tar_file.parent.absolute(), exist_ok=True) + with tarfile.open(target_tar_file, "w:gz") as the_tar_file: + for root, _, files in os.walk(source_dir): + for file in files: + file_path = os.path.join(root, file) + if parser.match(file_path): + continue + the_tar_file.add( + file_path, arcname=os.path.relpath(file_path, source_dir) + )