Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into zx_support_OVIS
Browse files Browse the repository at this point in the history
  • Loading branch information
ZX-ModelCloud committed Dec 17, 2024
2 parents 9314d73 + b13d23b commit f77d220
Show file tree
Hide file tree
Showing 164 changed files with 18,645 additions and 6,705 deletions.
52 changes: 52 additions & 0 deletions .buildkite/release.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash
export RUNNER=10.0.14.248

function install_requirements() {
  # Bootstrap the compiler environment from the runner host, then install
  # build/test dependencies from the internal PyPI mirror on $RUNNER.
  bash -c "$(curl -L "http://${RUNNER}/scripts/compiler/init_env.sh")" @ $CUDA $TORCH $PYTHON
  local mirror_opts=(-i "http://${RUNNER}/simple/" --trusted-host "${RUNNER}")
  uv pip install auto_round optimum bitblas==0.0.1.dev13 parameterized uvicorn "${mirror_opts[@]}"
  uv pip install transformers -U "${mirror_opts[@]}"
}

function compile() {
  # Build the project wheel, sanity-check it with twine, install it locally,
  # then upload the artifact to the runner's wheel-upload service.
  python setup.py bdist_wheel

  ls -ahl dist

  # Newest wheel in dist/ is the one this build just produced.
  whl=$(ls -t dist/*.whl | head -n 1 | xargs basename)
  sha256=$(sha256sum "dist/$whl")
  echo "hash=$sha256"

  twine check "dist/$whl"
  pip install "dist/$whl"

  # Upload to artifact server.
  # NOTE(fix): the original line used GitHub-Actions expressions
  # (${{ env.repo }}, ${{ needs.check-vm.outputs.ip }}, ...) which bash
  # cannot expand -- use the env vars this script exports instead.
  # Upload host assumed to be $RUNNER (same IP the workflow resolves) --
  # TODO confirm against the workflow's check-vm output.
  response=$(curl -s \
    -F "runid=$RUN_ID" \
    -F "repo=$REPO" \
    -F "ref=$REF" \
    -F "sha256=$sha256" \
    -F "file=@dist/$whl" \
    "http://$RUNNER/gpu/whl/upload")
  # String compare: a non-numeric server response would make `-eq` error out.
  if [ "$response" = "0" ]; then
    echo "UPLOADED=1" >> "$GITHUB_ENV"
  fi
}

# ---- entry point: <cuda> <torch> <python> <run_id> <repo> <ref> ----
# Validate the argument count up front (consistent with unit_tests.sh);
# previously a short invocation silently ran with empty env vars.
if [ "$#" -ne 6 ]; then
  echo "Usage: $0 <cuda> <torch> <python> <run_id> <repo> <ref>"
  exit 1
fi

cuda=$1
torch=$2
python=$3
run_id=$4
repo=$5
ref=$6

echo "CUDA Version: $cuda"
export CUDA=$cuda
echo "Torch Version: $torch"
export TORCH=$torch
echo "Python Version: $python"
export PYTHON=$python
echo "Run id: $run_id"
export RUN_ID=$run_id
echo "Repo: $repo"
export REPO=$repo
echo "Ref: $ref"
export REF=$ref

install_requirements
compile
98 changes: 98 additions & 0 deletions .buildkite/unit_tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/bin/bash
export RUNNER=10.0.14.248

function install_requirements() {
  # Set up the compiler environment via the runner's init script, then pull
  # test dependencies from the internal PyPI mirror hosted on $RUNNER.
  bash -c "$(curl -L "http://${RUNNER}/scripts/compiler/init_env.sh")" @ $CUDA $TORCH $PYTHON
  local index=(-i "http://${RUNNER}/simple/" --trusted-host "${RUNNER}")
  uv pip install auto_round optimum bitblas==0.0.1.dev13 parameterized uvicorn "${index[@]}"
  uv pip install transformers -U "${index[@]}"
}

function compile() {
  # Build the project wheel, verify it with twine, and install it locally.
  python setup.py bdist_wheel

  ls -ahl dist

  # Most recently modified wheel in dist/ is the one this build produced.
  whl=$(basename "$(ls -t dist/*.whl | head -n 1)")
  sha256=$(sha256sum "dist/${whl}")
  echo "hash=${sha256}"

  twine check "dist/${whl}"
  pip install "dist/${whl}"

  # upload to artifact
  # xxx
}

function clear_cache() {
  # After a failed BitBLAS test run the on-disk kernel cache may hold bad
  # compiled artifacts; remove it so the next run rebuilds from scratch.
  # (Fix: the original condition tested "$ERROR" == "1" twice.)
  if [ "$ERROR" == "1" ] && [ "$BITBLAS" == "1" ]; then
    rm -rf ~/.cache/bitblas/nvidia/geforce-rtx-4090
    echo "clear bitblas cache"
  fi
}

function find_gpu() {
  # Poll the runner's GPU broker until it returns a device id >= 0, then pin
  # this job to that device via CUDA_VISIBLE_DEVICES. STEP_TIMESTAMP tags the
  # allocation so release_gpu can return exactly this lease.
  local stamp
  stamp=$(date +%s%3N)
  local device=-1

  while [ "$device" -lt 0 ]; do
    device=$(curl -s "http://$RUNNER/gpu/get?id=$RUN_ID&timestamp=$stamp")

    if [ "$device" -lt 0 ]; then
      echo "http://$RUNNER/gpu/get?id=$RUN_ID&timestamp=$stamp returned $device"
      echo "No available GPU, waiting 5 seconds..."
      sleep 5
    else
      echo "Allocated GPU ID: $device"
    fi
  done
  export CUDA_VISIBLE_DEVICES=$device
  export STEP_TIMESTAMP=$stamp
  echo CUDA_VISIBLE_DEVICES set to $CUDA_VISIBLE_DEVICES, timestamp=$STEP_TIMESTAMP
}

function release_gpu() {
  # Return the GPU leased by find_gpu to the broker so other jobs can use it.
  local endpoint="http://$RUNNER/gpu/release?id=$RUN_ID&gpu=$CUDA_VISIBLE_DEVICES&timestamp=$STEP_TIMESTAMP"
  curl -X GET "$endpoint"
}

function test() {
  # Log the working directory layout for debugging, then run the selected
  # pytest module (tests/$TEST_NAME.py).
  echo "current dir:"
  pwd
  echo "===="
  ls
  echo "===="
  ls ..
  echo "===="
  # FIX: use `return 1`, not `exit 1`. The caller invokes `test || true`
  # precisely so that release_gpu/clear_cache still run on failure; `exit 1`
  # inside the function terminated the whole script, skipping cleanup and
  # leaking the GPU allocation. ERROR=1 is still recorded for clear_cache.
  pytest --durations=0 tests/$TEST_NAME.py || { export ERROR=1; return 1; }
}

# ---- entry point: <test_name> <cuda> <torch> <python> <run_id> <docker> ----
if [ "$#" -ne 6 ]; then
  echo "Usage: $0 <test_name> <cuda> <torch> <python> <run_id> <docker>"
  exit 1
fi

test_name=$1
cuda=$2
torch=$3
python=$4
run_id=$5
docker=$6

# Echo each setting and export it for the functions above.
printf 'Test Name: %s\n' "$test_name"
export TEST_NAME="${test_name%.py}"
printf 'CUDA Version: %s\n' "$cuda"
export CUDA="$cuda"
printf 'Torch Version: %s\n' "$torch"
export TORCH="$torch"
printf 'Python Version: %s\n' "$python"
export PYTHON="$python"
printf 'Run id: %s\n' "$run_id"
export RUN_ID="$run_id"
printf 'Docker image: %s\n' "$docker"
export DOCKER_IMAGE="$docker"

install_requirements
compile
find_gpu
# `|| true` keeps the script alive on test failure so the GPU is released
# and the cache cleanup still runs.
test || true
release_gpu
clear_cache
11 changes: 6 additions & 5 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ on:
max-parallel:
description: 'max parallel jobs'
required: false
default: '6'
default: '12'
upload_release:
description: 'upload to release (it only works with a tag ref)'
type: boolean
Expand All @@ -43,7 +43,7 @@ on:
env:
CUDA_DEVICE_ORDER: PCI_BUS_ID
RUNNER: 10.0.14.248
TORCH_CUDA_ARCH_LIST: '6.0 6.1 6.2 7.0 7.5 8.0 8.6 8.9 9.0'
TORCH_CUDA_ARCH_LIST: '8.0 8.6 8.9 9.0'
CUDA_RELEASE: 1
CI: 1
GPTQMODEL_FORCE_BUILD: 1
Expand Down Expand Up @@ -82,7 +82,7 @@ jobs:
echo "ip: $ip"
max_p=${{ github.event.inputs.max-parallel }}
max_p="{\"size\": ${max_p:-6}}"
max_p="{\"size\": ${max_p:-12}}"
echo "max-parallel=$max_p" >> "$GITHUB_OUTPUT"
echo "max-parallel=$max_p"
Expand Down Expand Up @@ -121,7 +121,7 @@ jobs:
release:
strategy:
fail-fast: false
max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 6 }}
max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 12 }}
matrix:
tag: ${{ fromJSON(needs.check-vm.outputs.task_list) }}
runs-on: self-hosted
Expand Down Expand Up @@ -162,7 +162,7 @@ jobs:
cuda_version=$(echo ${{ matrix.tag }} | grep -oP 'cuda\K[0-9.]+')
torch_version=$(echo ${{ matrix.tag }} | grep -oP 'torch\K[0-9.]+')
python_version=$(echo ${{ matrix.tag }} | grep -oP 'python\K[0-9.]+')
bash -c "$(curl -L http://${RUNNER}/scripts/compiler/init_env.sh)" @ $cuda_version $torch_version $python_version
bash -c "$(curl -L http://${RUNNER}/files/scripts/init_compiler.sh)" @ $cuda_version $torch_version $python_version
- name: Compile
run: python setup.py bdist_wheel
Expand All @@ -189,6 +189,7 @@ jobs:
uses: actions/upload-artifact@v4
continue-on-error: ${{ env.UPLOADED == '1' }}
with:
overwrite: true
name: ${{ env.WHL_NAME }}
path: dist/${{ env.WHL_NAME }}

Expand Down
Loading

0 comments on commit f77d220

Please sign in to comment.