diff --git a/.github/container/bump.sh b/.github/container/bump.sh index e1c143132..ea702b481 100755 --- a/.github/container/bump.sh +++ b/.github/container/bump.sh @@ -4,17 +4,13 @@ usage() { cat < /dev/null && pwd ) BASE_PATCH_DIR=${BASE_PATCH_DIR:-$SCRIPT_DIR/patches} -SKIP_BUMP_REFS=${SKIP_BUMP_REFS:-0} if [[ -z "${MANIFEST_IN:-}" ]]; then echo "Need to provide a value for -i/--input-manifest" @@ -84,19 +75,8 @@ else fi for pkg in $(yq e 'keys | .[]' $MANIFEST_OUT); do - mode=$(yq e ".${pkg}.mode" $MANIFEST_OUT) - if [[ $mode == git-clone || $mode == pip-vcs ]] && [[ $SKIP_BUMP_REFS -eq 0 ]]; then - url=$(yq e ".${pkg}.url" $MANIFEST_OUT) - tracking_ref=$(yq e ".${pkg}.tracking_ref" $MANIFEST_OUT) - if ! new_ref=$(git ls-remote --exit-code $url $tracking_ref | awk '{print $1}'); then - echo "Could not fetch $tracking_ref from $url" - exit 1 - fi - yq e ".${pkg}.latest_verified_commit = \"$new_ref\"" -i $MANIFEST_OUT - fi - has_patches=$(yq e ".${pkg} | has(\"patches\")" $MANIFEST_OUT) - if [[ $mode == git-clone && $has_patches == "true" ]]; then + if [[ $has_patches == "true" ]]; then url=$(yq e ".${pkg}.url" $MANIFEST_OUT) repo_tmp=$(mktemp -d /tmp/${pkg}.XXXXXX) git clone $url $repo_tmp diff --git a/.github/container/create-distribution.sh b/.github/container/create-distribution.sh index 46b283073..3b45597f7 100755 --- a/.github/container/create-distribution.sh +++ b/.github/container/create-distribution.sh @@ -157,7 +157,7 @@ done git fetch origin $TRACKING_REF # previous-HEAD's purpose is to point to the state of the repo before any distribution changes are made -# We do not rely on the manifest.yaml's .${library}.latest_verified_commit because local commits may be made on top by the upstream docker builds +# We do not rely on the manifest.yaml's .${library}.commit because local commits may be made on top by the upstream docker builds if ! git rev-parse --verify previous-HEAD >/dev/null 2>&1; then echo "[INFO]: Basing distribution on HEAD ($(git rev-parse HEAD)) and marking that with the local branch: previous-HEAD" git branch --force previous-HEAD HEAD diff --git a/.github/container/git-clone.sh b/.github/container/git-clone.sh index f4ddbc7fb..706fd926b 100755 --- a/.github/container/git-clone.sh +++ b/.github/container/git-clone.sh @@ -84,4 +84,4 @@ popd mkdir -p $(dirname ${MANIFEST}) touch ${MANIFEST} PACKAGE=$(basename "${DESTINATION}") -yq eval --inplace ". += {\"${PACKAGE}\": {\"url\": \"${GIT_REPO}\", \"tracking_ref\": \"${GIT_REF}\", \"latest_verified_commit\": \"${COMMIT_SHA}\", \"mode\": \"git-clone\"}}" ${MANIFEST} +yq eval --inplace ". += {\"${PACKAGE}\": {\"url\": \"${GIT_REPO}\", \"tracking_ref\": \"${GIT_REF}\", \"commit\": \"${COMMIT_SHA}\", \"mode\": \"git-clone\"}}" ${MANIFEST} diff --git a/.github/container/manifest.yaml b/.github/container/manifest.yaml index 2e58815c7..756dc8d5a 100644 --- a/.github/container/manifest.yaml +++ b/.github/container/manifest.yaml @@ -1,32 +1,14 @@ -jax: - url: https://github.com/google/jax.git - tracking_ref: main - latest_verified_commit: 8f4658ecdbe40cde0c43a9ab127359347943c076 - mode: git-clone -xla: - url: https://github.com/openxla/xla.git - tracking_ref: main - latest_verified_commit: 6de79c2c931374dfa7e403c0626d8dd1ec0ed398 - mode: git-clone flax: url: https://github.com/google/flax.git mirror_url: https://github.com/nvjax-svc-0/flax.git tracking_ref: main - latest_verified_commit: 718aa8ccb12c3fdefcf3d196874e4fc667b3ade5 - mode: git-clone + # Patches are only used for rosetta t5x patches: pull/3340/head: file://patches/flax/PR-3340.patch # Add Sharding Annotations to Flax Modules -transformer-engine: - url: https://github.com/NVIDIA/TransformerEngine.git - tracking_ref: main - latest_verified_commit: 1ec33ae1191ae6644365155f8e8f618145c44cd7 - mode: git-clone t5x: url: https://github.com/google-research/t5x.git mirror_url: https://github.com/nvjax-svc-0/t5x.git tracking_ref: main - latest_verified_commit: 707995a3a8238e0c3557d3cc1318a883215c54c9 - mode: git-clone patches: mirror/patch/partial-checkpoint-restore: file://patches/t5x/mirror-patch-partial-checkpoint-restore.patch # pull/1392/head # https://github.com/google-research/t5x/pull/1392: Add support for partial checkpoint restore mirror/patch/dali-support: file://patches/t5x/mirror-patch-dali-support.patch # pull/1393/head # https://github.com/google-research/t5x/pull/1393: Adds DALI support to t5x @@ -35,124 +17,12 @@ paxml: url: https://github.com/google/paxml.git mirror_url: https://github.com/nvjax-svc-0/paxml.git tracking_ref: main - latest_verified_commit: 051795784f8ddaba57eb51218addb5f1db8e04f4 - mode: git-clone patches: pull/46/head: file://patches/paxml/PR-46.patch # adds Transformer Engine support praxis: url: https://github.com/google/praxis.git mirror_url: https://github.com/nvjax-svc-0/praxis.git tracking_ref: main - latest_verified_commit: c58bcc4e82c80489a7f8a2c3366e7f6b32d271d4 - mode: git-clone patches: pull/27/head: file://patches/praxis/PR-27.patch # This PR allows XLA:GPU to detect the MHA pattern more easily to call fused kernels from cublas. pull/36/head: file://patches/praxis/PR-36.patch # adds Transformer Engine support -lingvo: - # Used only in ARM pax builds - url: https://github.com/tensorflow/lingvo.git - tracking_ref: master - latest_verified_commit: 05a076b0783a8bbf4a770095966c472bb37bbf65 - mode: git-clone -tensorflow-text: - # Used only in ARM pax and t5x builds - url: https://github.com/tensorflow/text.git - tracking_ref: v2.13.0 - latest_verified_commit: 917a681d7220ebf9b62a08b6f9ce7b7db886ddef - mode: git-clone -pydantic: - version: X.Y.Z - mode: pip-constraint -# Used by praxis -fiddle: - url: https://github.com/google/fiddle.git - tracking_ref: main - latest_verified_commit: 2a17618c56eb99aa58aa898ae12cbac7cf5c3b30 - mode: pip-vcs -# Used by t5x -airio: - url: https://github.com/google/airio.git - tracking_ref: main - latest_verified_commit: 3e13fd16038f3f376cddd289bd10eef53a4933f4 - mode: pip-vcs -clu: - url: https://github.com/google/CommonLoopUtils.git - tracking_ref: main - latest_verified_commit: c50acb760902c94a89ad3f605edc2d094bc2a7a1 - mode: pip-vcs -dllogger: - url: https://github.com/NVIDIA/dllogger.git - tracking_ref: master - latest_verified_commit: 0540a43971f4a8a16693a9de9de73c1072020769 - mode: pip-vcs -jestimator: - url: https://github.com/google-research/jestimator.git - tracking_ref: main - latest_verified_commit: 6a57d35539f5193a9756a7cb846654e9b221b2e7 - mode: pip-vcs -optax: - url: https://github.com/google-deepmind/optax.git - tracking_ref: main - latest_verified_commit: b4acf8eed4fe26f4b7be5337a8b72cde0ffbc3cf - mode: pip-vcs -seqio: - url: https://github.com/google/seqio.git - tracking_ref: main - latest_verified_commit: 11706e4a1e01a81ea6b3e02c5ad147028d5b94bb - mode: pip-vcs -jax-triton: - url: https://github.com/jax-ml/jax-triton.git - tracking_ref: main - latest_verified_commit: 1999d9b116bf7c5c94f70de4a45b414255366fbe - mode: git-clone -maxtext: - url: https://github.com/google/maxtext.git - tracking_ref: main - latest_verified_commit: 78daad198544def8274dbd656d122fbe6a0e1129 - mode: git-clone -levanter: - url: https://github.com/stanford-crfm/levanter.git - tracking_ref: main - latest_verified_commit: 19829c2c360cc1b8e7975f540e612845e4877a69 - mode: git-clone -haliax: - url: https://github.com/stanford-crfm/haliax.git - tracking_ref: main - latest_verified_commit: 2a696a0c971901ff93afdaa965959d8e3b982ba9 - mode: git-clone -mujoco: - url: https://github.com/google-deepmind/mujoco.git - tracking_ref: main - latest_verified_commit: e95159b4f6d48d114b16a8dc13ad26b3e44bc3e2 - mode: git-clone -grain: - # Used only in ARM t5x builds - url: https://github.com/google/grain.git - tracking_ref: main - latest_verified_commit: 10600a3f5510bcb696a90e72c6e6cb1ac2bb016f - mode: git-clone -mujoco-mpc: - url: https://github.com/google-deepmind/mujoco_mpc.git - tracking_ref: main - latest_verified_commit: 4700f4a13be18398f5aaf6a33ed42e531967e3ae - mode: git-clone -language-to-reward-2023: - url: https://github.com/google-deepmind/language_to_reward_2023.git - tracking_ref: main - latest_verified_commit: abb8e5125e4ecd0da378490b73448c05a694def5 - mode: git-clone -mlperf-logging: - url: https://github.com/mlcommons/logging.git - tracking_ref: master - latest_verified_commit: 99ba37ac267c870d7c6c17e1837aa9180a37cdc1 - mode: pip-vcs -equinox: - url: https://github.com/patrick-kidger/equinox.git - tracking_ref: main - latest_verified_commit: 1e601672d38d2c4d483535070a3572d8e8508a20 - mode: git-clone -grok-1: - url: https://github.com/xai-org/grok-1.git - tracking_ref: main - latest_verified_commit: 7207216386e07206b2083c5c0be88db1add8e631 - mode: git-clone diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 4107e87ae..eec1e25cb 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -10,14 +10,23 @@ pushd /opt/pip-tools.d # twice pip-compile -o requirements.pre $(ls requirements-*.in) -IFS=$'\n' -for line in $(cat requirements.pre | egrep '^[^#].+ @ git\+' || true); do - # VCS installs are of the form "PACKAGE @ git+..." - PACKAGE=$(echo "$line" | awk '{print $1}') - ref=$(yq e ".${PACKAGE}.latest_verified_commit" ${MANIFEST_FILE}) - echo "${line}@${ref}" +# Find the VCS installs, which are of the form +# PACKAGE @ git+GIT_REPO_URL +for line in $(sed -n -e 's/^\([^#].*\) @ git+\(.*\)$/\1=\2/p' requirements.pre); do + PACKAGE="${line%=*}" + REPO_URL="${line#*=}" + ref=$(yq e ".${PACKAGE}.commit" ${MANIFEST_FILE}) + if [[ "${ref}" == "null" ]]; then + # If a commit wasn't pinned in the manifest, get the latest version of the + # default branch of $REPO_URL, pin it, and write it to the manifest. + ref=$(git ls-remote --exit-code "${REPO_URL}" HEAD | awk '{ print $1 }') + touch /opt/manifest.d/pip-finalize.yaml + yq -i e ".${PACKAGE}.commit = \"${ref}\"" /opt/manifest.d/pip-finalize.yaml + yq -i e ".${PACKAGE}.mode = \"pip-vcs\"" /opt/manifest.d/pip-finalize.yaml + yq -i e ".${PACKAGE}.url = \"${REPO_URL}\"" /opt/manifest.d/pip-finalize.yaml + fi + echo "${PACKAGE} @ git+${REPO_URL}@${ref}" done | tee requirements.vcs -unset IFS # Second pip-compile includes one more requirements file that pins all vcs installs # Uses a special env var to let our custom pip impl know to treat the following as diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 9af951070..53f9fbfca 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -54,10 +54,10 @@ jobs: DOCKERFILE: .github/container/Dockerfile.jax RUNNER_SIZE: large EXTRA_BUILD_ARGS: | - URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} - URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} - URLREF_FLAX=${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} - URLREF_TRANSFORMER_ENGINE=${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} + ${{ fromJson(inputs.SOURCE_URLREFS).JAX }} + ${{ fromJson(inputs.SOURCE_URLREFS).XLA }} + ${{ fromJson(inputs.SOURCE_URLREFS).FLAX }} + ${{ fromJson(inputs.SOURCE_URLREFS).TRANSFORMER_ENGINE }} secrets: inherit build-triton: @@ -72,7 +72,7 @@ jobs: BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: triton DOCKERFILE: .github/container/Dockerfile.triton - EXTRA_BUILD_ARGS: URLREF_JAX_TRITON=${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} + EXTRA_BUILD_ARGS: ${{ fromJson(inputs.SOURCE_URLREFS).JAX_TRITON }} secrets: inherit build-equinox: @@ -86,8 +86,7 @@ jobs: BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: equinox DOCKERFILE: .github/container/Dockerfile.equinox - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + EXTRA_BUILD_ARGS: ${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} secrets: inherit build-maxtext: @@ -102,8 +101,7 @@ jobs: BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: maxtext DOCKERFILE: .github/container/Dockerfile.maxtext.amd64 - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + EXTRA_BUILD_ARGS: ${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} secrets: inherit build-levanter: @@ -118,8 +116,8 @@ jobs: CONTAINER_NAME: levanter DOCKERFILE: .github/container/Dockerfile.levanter EXTRA_BUILD_ARGS: | - URLREF_LEVANTER=${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} - URLREF_HALIAX=${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} + ${{ fromJson(inputs.SOURCE_URLREFS).LEVANTER }} + ${{ fromJson(inputs.SOURCE_URLREFS).HALIAX }} secrets: inherit build-upstream-t5x: @@ -133,8 +131,7 @@ jobs: BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: upstream-t5x DOCKERFILE: .github/container/Dockerfile.t5x.${{ inputs.ARCHITECTURE }} - EXTRA_BUILD_ARGS: | - URLREF_T5X=${{ fromJson(inputs.SOURCE_URLREFS).T5X }} + EXTRA_BUILD_ARGS: ${{ fromJson(inputs.SOURCE_URLREFS).T5X }} secrets: inherit build-upstream-pax: @@ -149,8 +146,8 @@ jobs: CONTAINER_NAME: upstream-pax DOCKERFILE: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} EXTRA_BUILD_ARGS: | - URLREF_PAXML=${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} - URLREF_PRAXIS=${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} + ${{ fromJson(inputs.SOURCE_URLREFS).PAXML }} + ${{ fromJson(inputs.SOURCE_URLREFS).PRAXIS }} secrets: inherit build-rosetta-t5x: @@ -184,8 +181,7 @@ jobs: BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} CONTAINER_NAME: grok DOCKERFILE: .github/container/Dockerfile.grok - EXTRA_BUILD_ARGS: | - URLREF_GROK_1=${{ fromJson(inputs.SOURCE_URLREFS).GROK_1 }} + EXTRA_BUILD_ARGS: ${{ fromJson(inputs.SOURCE_URLREFS).GROK_1 }} secrets: inherit collect-docker-tags: diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 5abbd727e..f25ad7985 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -21,8 +21,8 @@ on: BUMP_MANIFEST: type: boolean description: Bump git repos in manifest.yaml to head of tree? - default: false - required: false + default: true + required: true MERGE_BUMPED_MANIFEST: type: boolean description: "(used if BUMP_MANIFEST=true) If true: attempt to PR/merge manifest branch" @@ -88,7 +88,11 @@ jobs: id: manifest-branch shell: bash -x -e {0} run: | - BUMP_MANIFEST=${{ github.event_name == 'schedule' || inputs.BUMP_MANIFEST || 'false' }} + if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + BUMP_MANIFEST="${{ inputs.BUMP_MANIFEST }}" + else + BUMP_MANIFEST="true" + fi MERGE_BUMPED_MANIFEST=${{ github.event_name == 'schedule' || inputs.MERGE_BUMPED_MANIFEST || 'false' }} # Prepend nightly manifest branch with "z" to make it appear at the end if [[ "$BUMP_MANIFEST" == "true" ]]; then @@ -155,7 +159,7 @@ jobs: # converts manifest yaml to a json object of {SOFTWARE_NAME: URL#REF, ...} urlrefs=$( cat .github/container/manifest.yaml |\ - yq -o=json 'to_entries | .[] | select(.value.mode == "git-clone") | {( .key | upcase | sub("-", "_") ): .value.url + "#" + .value.latest_verified_commit}' |\ + yq -o=json 'to_entries | .[] | select(.value.mode == "git-clone") | {( .key | upcase | sub("-", "_") ): "URLREF_" + ( .key | upcase | sub("-", "_") ) + "=" + .value.url + "#" + .value.commit}' |\ jq -c -s 'add' ) # SOURCE_OVERRIDES is a comma-separated list of package=urlref pairs @@ -163,7 +167,7 @@ jobs: for override in "${overrides[@]}"; do PACKAGE=$(cut -d= -f 1 <<< "${override}" | tr '[:lower:]' '[:upper:]' | tr '-' '_') URLREF=$(cut -d= -f 2- <<< "${override}") - urlrefs=$(echo "$urlrefs" | jq -c ". + {\"$PACKAGE\": \"$URLREF\"}") + urlrefs=$(echo "$urlrefs" | jq -c ". + {\"$PACKAGE\": \"URLREF_${PACKAGE}=$URLREF\"}") done echo "SOURCE_URLREFS=${urlrefs}" >> $GITHUB_OUTPUT diff --git a/rosetta/Dockerfile.pax b/rosetta/Dockerfile.pax index 140ab2b5f..86806bdf5 100644 --- a/rosetta/Dockerfile.pax +++ b/rosetta/Dockerfile.pax @@ -42,7 +42,7 @@ cp -r /mnt/jax-toolbox/rosetta /opt/rosetta if [[ -n "${UPDATED_TE_REF}" ]]; then TE_INSTALL_DIR=/opt/transformer-engine - yq e ".transformer-engine.latest_verified_commit = \"${UPDATED_TE_REF}\"" -i $MANIFEST_FILE + yq e ".transformer-engine.commit = \"${UPDATED_TE_REF}\"" -i /opt/manifest.d/git-clone.yaml # Install from source instead of pre-built wheel sed -i -E 's@( file:///opt/transformer-engine)/dist/[^ ]*@\1@' /opt/pip-tools.d/requirements-te.in git -C $TE_INSTALL_DIR fetch -a diff --git a/rosetta/Dockerfile.t5x b/rosetta/Dockerfile.t5x index 468bc91c6..7ad552695 100644 --- a/rosetta/Dockerfile.t5x +++ b/rosetta/Dockerfile.t5x @@ -42,7 +42,7 @@ cp -r /mnt/jax-toolbox/rosetta /opt/rosetta if [[ -n "${UPDATED_TE_REF}" ]]; then TE_INSTALL_DIR=/opt/transformer-engine - yq e ".transformer-engine.latest_verified_commit = \"${UPDATED_TE_REF}\"" -i $MANIFEST_FILE + yq e ".transformer-engine.commit = \"${UPDATED_TE_REF}\"" -i /opt/manifest.d/git-clone.yaml # Install from source instead of pre-built wheel sed -i -E 's@( file:///opt/transformer-engine)/dist/[^ ]*@\1@' /opt/pip-tools.d/requirements-te.in git -C $TE_INSTALL_DIR fetch -a diff --git a/rosetta/tests/extra-only-distribution.sh b/rosetta/tests/extra-only-distribution.sh index 0b360fde6..1b4655bb8 100755 --- a/rosetta/tests/extra-only-distribution.sh +++ b/rosetta/tests/extra-only-distribution.sh @@ -42,8 +42,6 @@ t5x: mirror_url: https://github.com/nvjax-svc-0/t5x.git extra_dir: $extra_tmp tracking_ref: main - latest_verified_commit: $DISTRIBUTION_BASE_REF - mode: git-clone patches: $EXTRA_PATCH_BRANCH: null EOF diff --git a/rosetta/tests/local-patch-distribution.sh b/rosetta/tests/local-patch-distribution.sh index 616b92614..cb2a062d1 100755 --- a/rosetta/tests/local-patch-distribution.sh +++ b/rosetta/tests/local-patch-distribution.sh @@ -29,8 +29,6 @@ t5x: url: https://github.com/google-research/t5x.git mirror_url: https://github.com/nvjax-svc-0/t5x.git tracking_ref: main - latest_verified_commit: $DISTRIBUTION_BASE_REF - mode: git-clone patches: pull/1372/head: null EOF diff --git a/rosetta/tests/mirror-only-distribution.sh b/rosetta/tests/mirror-only-distribution.sh index aae611219..234acce2c 100755 --- a/rosetta/tests/mirror-only-distribution.sh +++ b/rosetta/tests/mirror-only-distribution.sh @@ -28,8 +28,6 @@ t5x: url: https://github.com/google-research/t5x.git mirror_url: https://github.com/nvjax-svc-0/t5x.git tracking_ref: main - latest_verified_commit: $DISTRIBUTION_BASE_REF - mode: git-clone patches: mirror/pull/4/head: null EOF diff --git a/rosetta/tests/upstream-only-distribution.sh b/rosetta/tests/upstream-only-distribution.sh index ca0ec10a8..1059d067c 100755 --- a/rosetta/tests/upstream-only-distribution.sh +++ b/rosetta/tests/upstream-only-distribution.sh @@ -28,8 +28,6 @@ t5x: url: https://github.com/google-research/t5x.git mirror_url: https://github.com/nvjax-svc-0/t5x.git tracking_ref: main - latest_verified_commit: $DISTRIBUTION_BASE_REF - mode: git-clone patches: pull/1372/head: null EOF