From c4bc34f9ff31a9d4419be7a8e9804dd7341f0143 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 29 Aug 2024 20:34:00 -0700 Subject: [PATCH] feat(build): remove base-requirements.txt (#11238) Co-authored-by: David Leifker --- .github/actions/ci-optimization/action.yml | 3 +- .../docker-custom-build-and-push/action.yml | 81 ++-- .github/workflows/docker-unified.yml | 54 ++- docker/datahub-ingestion-base/Dockerfile | 12 +- .../base-requirements.txt | 385 ------------------ docker/datahub-ingestion-base/build.gradle | 2 +- .../regenerate-base-requirements.sh | 36 -- docker/datahub-ingestion/Dockerfile | 16 +- docker/datahub-ingestion/Dockerfile-slim-only | 9 +- docker/datahub-ingestion/build.gradle | 2 +- 10 files changed, 119 insertions(+), 481 deletions(-) delete mode 100644 docker/datahub-ingestion-base/base-requirements.txt delete mode 100755 docker/datahub-ingestion-base/regenerate-base-requirements.sh diff --git a/.github/actions/ci-optimization/action.yml b/.github/actions/ci-optimization/action.yml index ff901b5de04b65..0d435963382675 100644 --- a/.github/actions/ci-optimization/action.yml +++ b/.github/actions/ci-optimization/action.yml @@ -57,7 +57,8 @@ runs: - "metadata-ingestion-modules/**" - "metadata-ingestion/**" - "metadata-models/**" - - "docker/datahub-ingestion**" + - "docker/datahub-ingestion-base/**" + - "docker/datahub-ingestion/**" ingestion-base: - "docker/datahub-ingestion-base/**" docker: diff --git a/.github/actions/docker-custom-build-and-push/action.yml b/.github/actions/docker-custom-build-and-push/action.yml index 9da76541633338..ccaff510c120aa 100644 --- a/.github/actions/docker-custom-build-and-push/action.yml +++ b/.github/actions/docker-custom-build-and-push/action.yml @@ -31,16 +31,21 @@ inputs: description: "Main tag to use for the Docker image" required: true flavor: - description: 'Image flavor (e.g., slim, full)' + description: "Image flavor (e.g., slim, full)" required: false target: description: "Sets the target stage to build" required: false + depot-project: + # Setting this will use native arm64 docker builds instead of QEMU emulation. + # This speeds up builds by 2-3x. + description: "Depot project id" + required: false + outputs: image_tag: description: "Docker image tags" value: ${{ steps.docker_meta.outputs.tags }} - # image_name: ${{ env.DATAHUB_GMS_IMAGE }} runs: using: "composite" @@ -58,9 +63,22 @@ runs: type=raw,value=head,suffix=${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }},enable={{is_default_branch}} type=sha,prefix=,format=short,suffix=${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }} + - name: Single Tag + id: single_tag + shell: bash + run: | + IMAGES=""" + ${{ inputs.images }} + """ + TAGS=""" + ${{ inputs.image_tag }} + """ + echo "SINGLE_IMAGE=$(echo $IMAGES | tr '\n' ' ' | awk -F' |,' '{ print $1 }')" >> "$GITHUB_OUTPUT" + echo "SINGLE_TAG=$(echo $IMAGES | tr '\n' ' ' | awk -F' |,' '{ print $1 }'):$(echo $TAGS | tr '\n' ' ' | awk -F' |,' '{ print $1 }')" >> "$GITHUB_OUTPUT" + # Code for testing the build when not pushing to Docker Hub. - name: Build and Load image for testing (if not publishing) - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 if: ${{ inputs.publish != 'true' }} with: context: ${{ inputs.context }} @@ -73,20 +91,11 @@ runs: target: ${{ inputs.target }} load: true push: false - cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }} - cache-to: type=inline - - name: Single Tag - if: ${{ inputs.publish != 'true' }} - shell: bash - run: | - IMAGES=""" - ${{ inputs.images }} - """ - TAGS=""" - ${{ inputs.image_tag }} - """ - echo "SINGLE_TAG=$(echo $IMAGES | tr '\n' ' ' | awk -F' |,' '{ print $1 }'):$(echo $TAGS | tr '\n' ' ' | awk -F' |,' '{ print $1 }')" >> $GITHUB_OUTPUT - id: single_tag + cache-from: | + type=registry,ref=${{ steps.single_tag.outputs.SINGLE_IMAGE }}:head${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }} + type=registry,ref=${{ steps.docker_meta.outputs.tags }} + cache-to: | + type=inline - name: Upload image locally for testing (if not publishing) uses: ishworkh/docker-image-artifact-upload@v1 if: ${{ inputs.publish != 'true' }} @@ -96,19 +105,42 @@ runs: # Code for building multi-platform images and pushing to Docker Hub. - name: Set up QEMU uses: docker/setup-qemu-action@v3 - if: ${{ inputs.publish == 'true' }} + if: ${{ inputs.publish == 'true' && inputs.depot-project == '' }} - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - if: ${{ inputs.publish == 'true' }} + if: ${{ inputs.publish == 'true' && inputs.depot-project == '' }} + - name: Setup Depot CLI + uses: depot/setup-action@v1 + if: ${{ inputs.publish == 'true' && inputs.depot-project != '' }} - name: Login to DockerHub uses: docker/login-action@v3 if: ${{ inputs.publish == 'true' }} with: username: ${{ inputs.username }} password: ${{ inputs.password }} + + # Depot variant. - name: Build and Push Multi-Platform image - uses: docker/build-push-action@v5 - if: ${{ inputs.publish == 'true' }} + uses: depot/build-push-action@v1 + if: ${{ inputs.publish == 'true' && inputs.depot-project != '' }} + with: + project: ${{ inputs.depot-project }} + context: ${{ inputs.context }} + file: ${{ inputs.file }} + platforms: ${{ inputs.platforms }} + build-args: ${{ inputs.build-args }} + tags: ${{ steps.docker_meta.outputs.tags }} + target: ${{ inputs.target }} + push: true + cache-from: | + type=registry,ref=${{ steps.single_tag.outputs.SINGLE_IMAGE }}:head${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }} + type=registry,ref=${{ steps.docker_meta.outputs.tags }} + cache-to: | + type=inline + + - name: Build and Push Multi-Platform image + uses: docker/build-push-action@v6 + if: ${{ inputs.publish == 'true' && inputs.depot-project == '' }} with: context: ${{ inputs.context }} file: ${{ inputs.file }} @@ -117,7 +149,10 @@ runs: tags: ${{ steps.docker_meta.outputs.tags }} target: ${{ inputs.target }} push: true - cache-from: type=registry,ref=${{ steps.docker_meta.outputs.tags }} - cache-to: type=inline + cache-from: | + type=registry,ref=${{ steps.single_tag.outputs.SINGLE_IMAGE }}:head${{ inputs.flavor && format('-{0}', inputs.flavor) || '' }} + type=registry,ref=${{ steps.docker_meta.outputs.tags }} + cache-to: | + type=inline # TODO add code for vuln scanning? diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 2fe73a31eeb03e..c2adf628470e61 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -33,6 +33,10 @@ env: DATAHUB_INGESTION_BASE_IMAGE: "acryldata/datahub-ingestion-base" DATAHUB_INGESTION_IMAGE: "acryldata/datahub-ingestion" +permissions: + contents: read + id-token: write + jobs: setup: runs-on: ubuntu-latest @@ -68,23 +72,23 @@ jobs: id: tag run: | source .github/scripts/docker_helpers.sh - echo "short_sha=${SHORT_SHA}" >> $GITHUB_OUTPUT - echo "tag=$(get_tag)" >> $GITHUB_OUTPUT - echo "slim_tag=$(get_tag_slim)" >> $GITHUB_OUTPUT - echo "full_tag=$(get_tag_full)" >> $GITHUB_OUTPUT - echo "unique_tag=$(get_unique_tag)" >> $GITHUB_OUTPUT - echo "unique_slim_tag=$(get_unique_tag_slim)" >> $GITHUB_OUTPUT - echo "unique_full_tag=$(get_unique_tag_full)" >> $GITHUB_OUTPUT - echo "python_release_version=$(get_python_docker_release_v)" >> $GITHUB_OUTPUT - echo "branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_OUTPUT - echo "repository_name=${GITHUB_REPOSITORY#*/}" >> $GITHUB_OUTPUT + echo "short_sha=${SHORT_SHA}" >> "$GITHUB_OUTPUT" + echo "tag=$(get_tag)" >> "$GITHUB_OUTPUT" + echo "slim_tag=$(get_tag_slim)" >> "$GITHUB_OUTPUT" + echo "full_tag=$(get_tag_full)" >> "$GITHUB_OUTPUT" + echo "unique_tag=$(get_unique_tag)" >> "$GITHUB_OUTPUT" + echo "unique_slim_tag=$(get_unique_tag_slim)" >> "$GITHUB_OUTPUT" + echo "unique_full_tag=$(get_unique_tag_full)" >> "$GITHUB_OUTPUT" + echo "python_release_version=$(get_python_docker_release_v)" >> "$GITHUB_OUTPUT" + echo "branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> "$GITHUB_OUTPUT" + echo "repository_name=${GITHUB_REPOSITORY#*/}" >> "$GITHUB_OUTPUT" - name: Check whether docker login is possible id: docker-login env: ENABLE_DOCKER_LOGIN: ${{ secrets.ACRYL_DOCKER_PASSWORD != '' }} run: | echo "Enable Docker Login: ${{ env.ENABLE_DOCKER_LOGIN }}" - echo "docker-login=${{ env.ENABLE_DOCKER_LOGIN }}" >> $GITHUB_OUTPUT + echo "docker-login=${{ env.ENABLE_DOCKER_LOGIN }}" >> "$GITHUB_OUTPUT" - name: Check whether publishing enabled id: publish env: @@ -95,7 +99,7 @@ jobs: }} run: | echo "Enable publish: ${{ env.ENABLE_PUBLISH }}" - echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT + echo "publish=${{ env.ENABLE_PUBLISH }}" >> "$GITHUB_OUTPUT" - name: Check whether PR publishing enabled id: pr-publish env: @@ -106,7 +110,7 @@ jobs: }} run: | echo "Enable PR publish: ${{ env.ENABLE_PUBLISH }}" - echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT + echo "publish=${{ env.ENABLE_PUBLISH }}" >> "$GITHUB_OUTPUT" - uses: ./.github/actions/ci-optimization id: ci-optimize - uses: actions/setup-python@v5 @@ -543,9 +547,10 @@ jobs: context: . file: ./docker/datahub-ingestion-base/Dockerfile platforms: linux/amd64,linux/arm64/v8 + depot-project: ${{ vars.DEPOT_PROJECT_ID }} - name: Compute DataHub Ingestion (Base) Tag id: tag - run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> "$GITHUB_OUTPUT" datahub_ingestion_base_slim_build: name: Build and Push DataHub Ingestion (Base-Slim) Docker Image runs-on: ubuntu-latest @@ -585,9 +590,10 @@ jobs: context: . file: ./docker/datahub-ingestion-base/Dockerfile platforms: linux/amd64,linux/arm64/v8 + depot-project: ${{ vars.DEPOT_PROJECT_ID }} - name: Compute DataHub Ingestion (Base-Slim) Tag id: tag - run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> "$GITHUB_OUTPUT" datahub_ingestion_base_full_build: name: Build and Push DataHub Ingestion (Base-Full) Docker Image runs-on: ubuntu-latest @@ -628,7 +634,7 @@ jobs: platforms: linux/amd64,linux/arm64/v8 - name: Compute DataHub Ingestion (Base-Full) Tag id: tag - run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ needs.setup.outputs.ingestion_base_change == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> "$GITHUB_OUTPUT" datahub_ingestion_slim_build: name: Build and Push DataHub Ingestion Docker Images @@ -681,9 +687,10 @@ jobs: context: . file: ./docker/datahub-ingestion/Dockerfile platforms: linux/amd64,linux/arm64/v8 + depot-project: ${{ vars.DEPOT_PROJECT_ID }} - name: Compute Tag id: tag - run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_slim_tag || 'head-slim' }}" >> "$GITHUB_OUTPUT" datahub_ingestion_slim_scan: permissions: contents: read # for actions/checkout to fetch code @@ -713,6 +720,7 @@ jobs: severity: "CRITICAL,HIGH" ignore-unfixed: true vuln-type: "os,library" + timeout: 15m - name: Upload Trivy scan results to GitHub Security tab uses: github/codeql-action/upload-sarif@v2 with: @@ -767,9 +775,10 @@ jobs: context: . file: ./docker/datahub-ingestion/Dockerfile platforms: linux/amd64,linux/arm64/v8 + depot-project: ${{ vars.DEPOT_PROJECT_ID }} - name: Compute Tag (Full) id: tag - run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ needs.setup.outputs.ingestion_change == 'true' && needs.setup.outputs.unique_tag || 'head' }}" >> "$GITHUB_OUTPUT" datahub_ingestion_full_scan: permissions: contents: read # for actions/checkout to fetch code @@ -799,6 +808,7 @@ jobs: severity: "CRITICAL,HIGH" ignore-unfixed: true vuln-type: "os,library" + timeout: 15m - name: Upload Trivy scan results to GitHub Security tab uses: github/codeql-action/upload-sarif@v2 with: @@ -813,13 +823,13 @@ jobs: - id: set-matrix run: | if [ '${{ needs.setup.outputs.frontend_only }}' == 'true' ]; then - echo 'matrix=["cypress_suite1","cypress_rest"]' >> $GITHUB_OUTPUT + echo 'matrix=["cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT" elif [ '${{ needs.setup.outputs.ingestion_only }}' == 'true' ]; then - echo 'matrix=["no_cypress_suite0","no_cypress_suite1"]' >> $GITHUB_OUTPUT + echo 'matrix=["no_cypress_suite0","no_cypress_suite1"]' >> "$GITHUB_OUTPUT" elif [[ '${{ needs.setup.outputs.backend_change }}' == 'true' || '${{ needs.setup.outputs.smoke_test_change }}' == 'true' ]]; then - echo 'matrix=["no_cypress_suite0","no_cypress_suite1","cypress_suite1","cypress_rest"]' >> $GITHUB_OUTPUT + echo 'matrix=["no_cypress_suite0","no_cypress_suite1","cypress_suite1","cypress_rest"]' >> "$GITHUB_OUTPUT" else - echo 'matrix=[]' >> $GITHUB_OUTPUT + echo 'matrix=[]' >> "$GITHUB_OUTPUT" fi smoke_test: diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index e3b0a33bb519a7..db1717ab59d163 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -53,9 +53,8 @@ RUN apt-get update && apt-get upgrade -y \ && apt-get clean \ && rm -rf /var/lib/{apt,dpkg,cache,log}/ -COPY --from=dockerize-binary /usr/local/bin/dockerize /usr/local/bin +COPY --from=powerman/dockerize:0.19 /usr/local/bin/dockerize /usr/local/bin -COPY ./docker/datahub-ingestion-base/base-requirements.txt requirements.txt COPY ./docker/datahub-ingestion-base/entrypoint.sh /entrypoint.sh RUN addgroup --gid 1000 datahub && \ @@ -67,7 +66,14 @@ ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt ENV VIRTUAL_ENV=/datahub-ingestion/.venv ENV PATH="${VIRTUAL_ENV}/bin:$PATH" RUN python3 -m venv $VIRTUAL_ENV && \ - uv pip install --no-cache -r requirements.txt + uv pip install --no-cache --upgrade pip setuptools wheel + +# Note: Normally uv will create hardlinks from the cache directory to the venv. +# In our docker files, we normally use `RUN --mount=type=cache,... uv pip install ...`, +# which means the cache directory is on a separate filesystem. uv will emit a warning: +# Failed to hardlink files; falling back to full copy. This may lead to degraded performance. +# If the cache and target directories are on different filesystems, hardlinking may not be supported. +# If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning. ENTRYPOINT [ "/entrypoint.sh" ] diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt deleted file mode 100644 index fa07b4184a6bc0..00000000000000 --- a/docker/datahub-ingestion-base/base-requirements.txt +++ /dev/null @@ -1,385 +0,0 @@ -# Generated requirements file. Run ./regenerate-base-requirements.sh to regenerate. -acryl-datahub-classify==0.0.11 -acryl-PyHive==0.6.16 -acryl-sqlglot==25.3.1.dev3 -aenum==3.1.15 -aiohappyeyeballs==2.3.2 -aiohttp==3.10.0 -aiosignal==1.3.1 -alembic==1.13.2 -altair==4.2.0 -anyio==4.4.0 -apache-airflow==2.9.3 -apache-airflow-providers-common-io==1.3.2 -apache-airflow-providers-common-sql==1.14.2 -apache-airflow-providers-fab==1.2.2 -apache-airflow-providers-ftp==3.10.0 -apache-airflow-providers-http==4.12.0 -apache-airflow-providers-imap==3.6.1 -apache-airflow-providers-smtp==1.7.1 -apache-airflow-providers-sqlite==3.8.1 -apispec==6.6.1 -appnope==0.1.4 -argcomplete==3.4.0 -argon2-cffi==23.1.0 -argon2-cffi-bindings==21.2.0 -asgiref==3.8.1 -asn1crypto==1.5.1 -asttokens==2.4.1 -async-timeout==4.0.3 -asynch==0.2.4 -attrs==23.2.0 -avro==1.11.3 -avro-gen3==0.7.13 -azure-common==1.1.28 -azure-core==1.29.4 -azure-identity==1.14.1 -azure-storage-blob==12.21.0 -azure-storage-file-datalake==12.16.0 -Babel==2.15.0 -backoff==2.2.1 -beautifulsoup4==4.12.3 -bleach==6.1.0 -blinker==1.8.2 -blis==0.7.11 -boto3==1.34.151 -botocore==1.34.151 -bracex==2.4 -cached-property==1.5.2 -cachelib==0.9.0 -cachetools==5.4.0 -catalogue==2.0.10 -cattrs==23.2.3 -certifi==2024.7.4 -cffi==1.16.0 -chardet==5.2.0 -charset-normalizer==3.3.2 -ciso8601==2.3.1 -click==8.1.7 -click-default-group==1.2.4 -click-spinner==0.1.10 -clickclick==20.10.2 -clickhouse-driver==0.2.8 -clickhouse-sqlalchemy==0.2.4 -cloudpathlib==0.18.1 -cloudpickle==3.0.0 -colorama==0.4.6 -colorlog==4.8.0 -comm==0.2.2 -confection==0.1.5 -ConfigUpdater==3.2 -confluent-kafka==2.5.0 -connexion==2.14.2 -cron-descriptor==1.4.3 -croniter==3.0.3 -cryptography==42.0.8 -cx_Oracle==8.3.0 -cymem==2.0.8 -databricks-dbapi==0.6.0 -databricks-sdk==0.29.0 -databricks-sql-connector==2.9.6 -dataflows-tabulator==1.54.3 -db-dtypes==1.2.0 -debugpy==1.8.2 -decorator==5.1.1 -defusedxml==0.7.1 -deltalake==0.17.4 -Deprecated==1.2.14 -dill==0.3.8 -dnspython==2.6.1 -docker==7.1.0 -docutils==0.21.2 -ecdsa==0.19.0 -elasticsearch==7.13.4 -email_validator==2.2.0 -entrypoints==0.4 -et-xmlfile==1.1.0 -exceptiongroup==1.2.2 -executing==2.0.1 -expandvars==0.12.0 -fastavro==1.9.5 -fastjsonschema==2.20.0 -filelock==3.15.4 -Flask==2.2.5 -flatdict==4.0.1 -frozenlist==1.4.1 -fsspec==2023.12.2 -future==1.0.0 -GeoAlchemy2==0.15.2 -gitdb==4.0.11 -GitPython==3.1.43 -google-api-core==2.19.1 -google-auth==2.32.0 -google-cloud-appengine-logging==1.4.5 -google-cloud-audit-log==0.2.5 -google-cloud-bigquery==3.25.0 -google-cloud-core==2.4.1 -google-cloud-datacatalog==3.20.0 -google-cloud-datacatalog-lineage==0.2.2 -google-cloud-logging==3.5.0 -google-crc32c==1.5.0 -google-re2==1.1.20240702 -google-resumable-media==2.7.1 -googleapis-common-protos==1.63.2 -gql==3.5.0 -graphql-core==3.2.3 -great-expectations==0.15.50 -greenlet==3.0.3 -grpc-google-iam-v1==0.13.1 -grpcio==1.65.2 -grpcio-status==1.62.2 -grpcio-tools==1.62.2 -gssapi==1.8.3 -gunicorn==22.0.0 -h11==0.14.0 -httpcore==1.0.5 -httpx==0.27.0 -humanfriendly==10.0 -idna==3.7 -ijson==3.3.0 -importlib_metadata==7.2.1 -importlib_resources==6.4.0 -inflection==0.5.1 -ipaddress==1.0.23 -ipykernel==6.17.1 -ipython==8.21.0 -ipython-genutils==0.2.0 -ipywidgets==8.1.3 -isodate==0.6.1 -itsdangerous==2.2.0 -jedi==0.19.1 -Jinja2==3.1.4 -jmespath==1.0.1 -JPype1==1.5.0 -jsonlines==4.0.0 -jsonpatch==1.33 -jsonpointer==3.0.0 -jsonref==1.1.0 -jsonschema==4.23.0 -jsonschema-specifications==2023.12.1 -jupyter-server==1.16.0 -jupyter_client==7.4.9 -jupyter_core==4.12.0 -jupyterlab_pygments==0.3.0 -jupyterlab_widgets==3.0.11 -langcodes==3.4.0 -language_data==1.2.0 -lark==1.1.4 -lazy-object-proxy==1.10.0 -leb128==1.0.8 -limits==3.13.0 -linear-tsv==1.1.0 -linkify-it-py==2.0.3 -lkml==1.3.5 -lockfile==0.12.2 -looker-sdk==23.0.0 -lxml==5.2.2 -lz4==4.3.3 -makefun==1.15.4 -Mako==1.3.5 -marisa-trie==1.2.0 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -marshmallow==3.21.3 -marshmallow-oneofschema==3.1.1 -marshmallow-sqlalchemy==0.28.2 -matplotlib-inline==0.1.7 -mdit-py-plugins==0.4.1 -mdurl==0.1.2 -methodtools==0.4.7 -mistune==3.0.2 -mixpanel==4.10.1 -mlflow-skinny==2.15.0 -mmhash3==3.0.1 -more-itertools==10.3.0 -moto==4.2.14 -msal==1.22.0 -msal-extensions==1.1.0 -multidict==6.0.5 -murmurhash==1.0.10 -mypy-extensions==1.0.0 -nbclassic==1.1.0 -nbclient==0.6.3 -nbconvert==7.16.4 -nbformat==5.10.4 -nest-asyncio==1.6.0 -networkx==3.3 -notebook==6.5.7 -notebook_shim==0.2.4 -numpy==1.26.4 -oauthlib==3.2.2 -okta==1.7.0 -openlineage-airflow==1.18.0 -openlineage-integration-common==1.18.0 -openlineage-python==1.18.0 -openlineage_sql==1.18.0 -openpyxl==3.1.5 -opentelemetry-api==1.26.0 -opentelemetry-exporter-otlp==1.26.0 -opentelemetry-exporter-otlp-proto-common==1.26.0 -opentelemetry-exporter-otlp-proto-grpc==1.26.0 -opentelemetry-exporter-otlp-proto-http==1.26.0 -opentelemetry-proto==1.26.0 -opentelemetry-sdk==1.26.0 -opentelemetry-semantic-conventions==0.47b0 -ordered-set==4.1.0 -packaging==24.1 -pandas==2.1.4 -pandocfilters==1.5.1 -parse==1.20.2 -parso==0.8.4 -pathspec==0.12.1 -pendulum==3.0.0 -pexpect==4.9.0 -phonenumbers==8.13.0 -platformdirs==4.2.2 -pluggy==1.5.0 -portalocker==2.10.1 -preshed==3.0.9 -prison==0.2.1 -progressbar2==4.4.2 -prometheus_client==0.20.0 -prompt_toolkit==3.0.47 -proto-plus==1.24.0 -protobuf==4.25.4 -psutil==6.0.0 -psycopg2-binary==2.9.9 -ptyprocess==0.7.0 -pure-sasl==0.6.2 -pure_eval==0.2.3 -py-partiql-parser==0.5.0 -pyarrow==17.0.0 -pyarrow-hotfix==0.6 -pyasn1==0.6.0 -pyasn1_modules==0.4.0 -pyathena==2.25.2 -pycountry==24.6.1 -pycparser==2.22 -pycryptodome==3.20.0 -pydantic==1.10.17 -pydash==8.0.3 -pydruid==0.6.9 -Pygments==2.18.0 -pyiceberg==0.4.0 -pymongo==4.8.0 -PyMySQL==1.1.1 -pyOpenSSL==24.2.1 -pyparsing==3.0.9 -pyspnego==0.11.1 -python-daemon==3.0.1 -python-dateutil==2.9.0.post0 -python-jose==3.3.0 -python-ldap==3.4.4 -python-liquid==1.12.1 -python-nvd3==0.16.0 -python-slugify==8.0.4 -python-stdnum==1.20 -python-tds==1.15.0 -python-utils==3.8.2 -pytz==2024.1 -PyYAML==6.0.1 -pyzmq==26.0.3 -redash-toolbelt==0.1.9 -redshift-connector==2.1.2 -referencing==0.35.1 -regex==2024.7.24 -requests==2.32.3 -requests-file==2.1.0 -requests-gssapi==1.3.0 -requests-toolbelt==1.0.0 -requests_ntlm==1.3.0 -responses==0.25.3 -rfc3339-validator==0.1.4 -rfc3986==2.0.0 -rich==13.7.1 -rich-argparse==1.5.2 -rpds-py==0.19.1 -rsa==4.9 -rstr==3.2.2 -ruamel.yaml==0.17.17 -s3transfer==0.10.2 -schwifty==2024.6.1 -scipy==1.14.0 -scramp==1.4.5 -Send2Trash==1.8.3 -sentry-sdk==2.12.0 -setproctitle==1.3.3 -shellingham==1.5.4 -simple-salesforce==1.12.6 -six==1.16.0 -slack-sdk==3.18.1 -smart-open==7.0.4 -smmap==5.0.1 -sniffio==1.3.1 -snowflake-connector-python==3.12.0 -snowflake-sqlalchemy==1.6.1 -sortedcontainers==2.4.0 -soupsieve==2.5 -spacy==3.7.5 -spacy-legacy==3.0.12 -spacy-loggers==1.0.5 -sql_metadata==2.12.0 -SQLAlchemy==1.4.44 -sqlalchemy-bigquery==1.11.0 -sqlalchemy-cockroachdb==1.4.4 -SQLAlchemy-JSONField==1.0.2 -sqlalchemy-pytds==0.3.5 -sqlalchemy-redshift==0.8.14 -SQLAlchemy-Utils==0.41.2 -sqlglotrs==0.2.7 -sqllineage==1.3.8 -sqlparse==0.4.4 -srsly==2.4.8 -stack-data==0.6.3 -strictyaml==1.7.3 -tableauserverclient==0.25 -tableschema==1.20.11 -tabulate==0.9.0 -tenacity==9.0.0 -teradatasql==20.0.0.14 -teradatasqlalchemy==20.0.0.1 -termcolor==2.4.0 -terminado==0.18.1 -text-unidecode==1.3 -thinc==8.2.5 -thrift==0.16.0 -thrift-sasl==0.4.3 -time-machine==2.14.2 -tinycss2==1.3.0 -toml==0.10.2 -tomlkit==0.13.0 -toolz==0.12.1 -tornado==6.4.1 -tqdm==4.66.4 -traitlets==5.2.1.post0 -trino==0.329.0 -typer==0.12.3 -typing-inspect==0.9.0 -typing_extensions==4.12.2 -tzdata==2024.1 -tzlocal==5.2 -uc-micro-py==1.0.3 -ujson==5.10.0 -unicodecsv==0.14.1 -universal_pathlib==0.2.2 -urllib3==1.26.19 -vertica-python==1.4.0 -vertica-sqlalchemy-dialect==0.0.8.2 -vininfo==1.8.0 -wasabi==1.1.3 -wcmatch==8.5.2 -wcwidth==0.2.13 -weasel==0.4.1 -webencodings==0.5.1 -websocket-client==1.8.0 -Werkzeug==2.2.3 -widgetsnbextension==4.0.11 -wirerope==0.4.7 -wrapt==1.16.0 -WTForms==3.1.2 -xlrd==2.0.1 -xmltodict==0.13.0 -yarl==1.9.4 -zeep==4.2.1 -zipp==3.19.2 -zstd==1.5.5.1 diff --git a/docker/datahub-ingestion-base/build.gradle b/docker/datahub-ingestion-base/build.gradle index 80719665f088bc..ef482de9256a33 100644 --- a/docker/datahub-ingestion-base/build.gradle +++ b/docker/datahub-ingestion-base/build.gradle @@ -12,7 +12,7 @@ ext { docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}" - revision = 6 // increment to trigger rebuild + revision = 7 // increment to trigger rebuild } docker { diff --git a/docker/datahub-ingestion-base/regenerate-base-requirements.sh b/docker/datahub-ingestion-base/regenerate-base-requirements.sh deleted file mode 100755 index 13d74922d9013b..00000000000000 --- a/docker/datahub-ingestion-base/regenerate-base-requirements.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -# This script is used to regenerate the base-requirements.txt file - -set -euxo pipefail -cd "$( dirname "${BASH_SOURCE[0]}" )" - -SCRIPT_NAME=$(basename "$0") -DATAHUB_DIR=$(pwd)/../.. - -# Create a virtualenv. -VENV_DIR=$(mktemp -d) -python -c "import sys; assert sys.version_info >= (3, 9), 'Python 3.9 or higher is required.'" -python -m venv $VENV_DIR -source $VENV_DIR/bin/activate -pip install --upgrade pip uv setuptools wheel -echo "Using virtualenv at $VENV_DIR" - -# Install stuff. -pushd $DATAHUB_DIR/metadata-ingestion -uv pip install -e '.[all]' -e '../metadata-ingestion-modules/airflow-plugin/[plugin-v2]' -popd - -# Generate the requirements file. -# Removing Flask deps due as per https://github.com/datahub-project/datahub/pull/6867/files -# Removing py4j and PyJWT due to https://github.com/datahub-project/datahub/pull/6868/files -# Removing pyspark and pydeequ because we don't want them in the slim image, so they can be added separately. -# TODO: It's unclear if these removals are still actually needed. -echo "# Generated requirements file. Run ./$SCRIPT_NAME to regenerate." > base-requirements.txt -pip freeze \ - | grep -v -E "^-e" \ - | grep -v -E "^uv==" \ - | grep -v "Flask-" \ - | grep -v -E "(py4j|PyJWT)==" \ - | grep -v -E "(pyspark|pydeequ)==" \ - >> base-requirements.txt diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 1a1e57f42c76f4..ee0333e1cb1d1f 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -21,11 +21,11 @@ ARG PIP_MIRROR_URL RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi ENV UV_INDEX_URL=${PIP_MIRROR_URL} -COPY --chown=datahub ./metadata-ingestion /datahub-ingestion -COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /datahub-ingestion/airflow-plugin +COPY --chown=datahub ./metadata-ingestion /metadata-ingestion +COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /metadata-ingestion/airflow-plugin ARG RELEASE_VERSION -WORKDIR /datahub-ingestion +WORKDIR /metadata-ingestion RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \ sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" airflow-plugin/src/datahub_airflow_plugin/__init__.py && \ cat src/datahub/__init__.py | grep __version__ && \ @@ -33,7 +33,8 @@ RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEAS FROM base AS slim-install -RUN uv pip install --no-cache -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" +RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \ + UV_LINK_MODE=copy uv pip install -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" FROM base AS full-install-build @@ -43,9 +44,10 @@ RUN apt-get update && apt-get install -y -qq maven USER datahub COPY ./docker/datahub-ingestion/pyspark_jars.sh . -RUN uv pip install --no-cache -e ".[base,all]" "./airflow-plugin[plugin-v2]" && \ +RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \ + UV_LINK_MODE=copy uv pip install -e ".[base,all]" "./airflow-plugin[plugin-v2]" && \ + ./pyspark_jars.sh && \ datahub --version -RUN ./pyspark_jars.sh FROM base AS full-install @@ -57,4 +59,6 @@ FROM base AS dev-install FROM ${APP_ENV}-install AS final +WORKDIR /datahub-ingestion + USER datahub diff --git a/docker/datahub-ingestion/Dockerfile-slim-only b/docker/datahub-ingestion/Dockerfile-slim-only index f17c8df63ae9d3..6ade262f2feded 100644 --- a/docker/datahub-ingestion/Dockerfile-slim-only +++ b/docker/datahub-ingestion/Dockerfile-slim-only @@ -11,18 +11,21 @@ ARG PIP_MIRROR_URL RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi ENV UV_INDEX_URL=${PIP_MIRROR_URL} -COPY --chown=datahub ./metadata-ingestion /datahub-ingestion +COPY --chown=datahub ./metadata-ingestion /metadata-ingestion ARG RELEASE_VERSION -WORKDIR /datahub-ingestion +WORKDIR /metadata-ingestion RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \ cat src/datahub/__init__.py FROM base as slim-install -RUN uv pip install --no-cache -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" && \ +RUN --mount=type=cache,target=/datahub-ingestion/.cache/uv,uid=1000,gid=1000 \ + UV_LINK_MODE=copy uv pip install -e ".[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary]" && \ datahub --version FROM slim-install as final +WORKDIR /datahub-ingestion + USER datahub diff --git a/docker/datahub-ingestion/build.gradle b/docker/datahub-ingestion/build.gradle index c6a3d93e5c4605..113a6dcf0a1bd4 100644 --- a/docker/datahub-ingestion/build.gradle +++ b/docker/datahub-ingestion/build.gradle @@ -12,7 +12,7 @@ ext { docker_target = project.getProperties().getOrDefault("dockerTarget", "slim") docker_version = "${version}${docker_target == 'slim' ? '-slim' : ''}" - revision = 6 // increment to trigger rebuild + revision = 8 // increment to trigger rebuild } dependencies {