Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into zx_support_OVIS
Browse files Browse the repository at this point in the history
  • Loading branch information
ZX-ModelCloud committed Dec 17, 2024
2 parents 9314d73 + b13d23b commit f77d220
Show file tree
Hide file tree
Showing 164 changed files with 18,645 additions and 6,705 deletions.
52 changes: 52 additions & 0 deletions .buildkite/release.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash
export RUNNER=10.0.14.248

function install_requirements() {
  # Bootstrap the compiler environment from the runner host, then install
  # build/test dependencies from the internal PyPI mirror on $RUNNER.
  bash -c "$(curl -L "http://${RUNNER}/scripts/compiler/init_env.sh")" @ $CUDA $TORCH $PYTHON
  local mirror_opts=(-i "http://${RUNNER}/simple/" --trusted-host "${RUNNER}")
  uv pip install auto_round optimum bitblas==0.0.1.dev13 parameterized uvicorn "${mirror_opts[@]}"
  uv pip install transformers -U "${mirror_opts[@]}"
}

function compile() {
  # Build the project wheel, sanity-check it with twine, install it locally,
  # then upload the artifact to the runner's wheel-upload service.
  python setup.py bdist_wheel

  ls -ahl dist

  # Newest wheel in dist/ is the one this build just produced.
  whl=$(ls -t dist/*.whl | head -n 1 | xargs basename)
  sha256=$(sha256sum "dist/$whl")
  echo "hash=$sha256"

  twine check "dist/$whl"
  pip install "dist/$whl"

  # Upload to artifact server.
  # NOTE(fix): the original line used GitHub-Actions expressions
  # (${{ env.repo }}, ${{ needs.check-vm.outputs.ip }}, ...) which bash
  # cannot expand -- use the env vars this script exports instead.
  # Upload host assumed to be $RUNNER (same IP the workflow resolves) --
  # TODO confirm against the workflow's check-vm output.
  response=$(curl -s \
    -F "runid=$RUN_ID" \
    -F "repo=$REPO" \
    -F "ref=$REF" \
    -F "sha256=$sha256" \
    -F "file=@dist/$whl" \
    "http://$RUNNER/gpu/whl/upload")
  # String compare: a non-numeric server response would make `-eq` error out.
  if [ "$response" = "0" ]; then
    echo "UPLOADED=1" >> "$GITHUB_ENV"
  fi
}

# ---- entry point: <cuda> <torch> <python> <run_id> <repo> <ref> ----
# Validate the argument count up front (consistent with unit_tests.sh);
# previously a short invocation silently ran with empty env vars.
if [ "$#" -ne 6 ]; then
  echo "Usage: $0 <cuda> <torch> <python> <run_id> <repo> <ref>"
  exit 1
fi

cuda=$1
torch=$2
python=$3
run_id=$4
repo=$5
ref=$6

echo "CUDA Version: $cuda"
export CUDA=$cuda
echo "Torch Version: $torch"
export TORCH=$torch
echo "Python Version: $python"
export PYTHON=$python
echo "Run id: $run_id"
export RUN_ID=$run_id
echo "Repo: $repo"
export REPO=$repo
echo "Ref: $ref"
export REF=$ref

install_requirements
compile
98 changes: 98 additions & 0 deletions .buildkite/unit_tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/bin/bash
export RUNNER=10.0.14.248

function install_requirements() {
  # Set up the compiler environment via the runner's init script, then pull
  # test dependencies from the internal PyPI mirror hosted on $RUNNER.
  bash -c "$(curl -L "http://${RUNNER}/scripts/compiler/init_env.sh")" @ $CUDA $TORCH $PYTHON
  local index=(-i "http://${RUNNER}/simple/" --trusted-host "${RUNNER}")
  uv pip install auto_round optimum bitblas==0.0.1.dev13 parameterized uvicorn "${index[@]}"
  uv pip install transformers -U "${index[@]}"
}

function compile() {
  # Build the project wheel, verify it with twine, and install it locally.
  python setup.py bdist_wheel

  ls -ahl dist

  # Most recently modified wheel in dist/ is the one this build produced.
  whl=$(basename "$(ls -t dist/*.whl | head -n 1)")
  sha256=$(sha256sum "dist/${whl}")
  echo "hash=${sha256}"

  twine check "dist/${whl}"
  pip install "dist/${whl}"

  # upload to artifact
  # xxx
}

function clear_cache() {
  # After a failed BitBLAS test run the on-disk kernel cache may hold bad
  # compiled artifacts; remove it so the next run rebuilds from scratch.
  # (Fix: the original condition tested "$ERROR" == "1" twice.)
  if [ "$ERROR" == "1" ] && [ "$BITBLAS" == "1" ]; then
    rm -rf ~/.cache/bitblas/nvidia/geforce-rtx-4090
    echo "clear bitblas cache"
  fi
}

function find_gpu() {
  # Poll the runner's GPU broker until it returns a device id >= 0, then pin
  # this job to that device via CUDA_VISIBLE_DEVICES. STEP_TIMESTAMP tags the
  # allocation so release_gpu can return exactly this lease.
  local stamp
  stamp=$(date +%s%3N)
  local device=-1

  while [ "$device" -lt 0 ]; do
    device=$(curl -s "http://$RUNNER/gpu/get?id=$RUN_ID&timestamp=$stamp")

    if [ "$device" -lt 0 ]; then
      echo "http://$RUNNER/gpu/get?id=$RUN_ID&timestamp=$stamp returned $device"
      echo "No available GPU, waiting 5 seconds..."
      sleep 5
    else
      echo "Allocated GPU ID: $device"
    fi
  done
  export CUDA_VISIBLE_DEVICES=$device
  export STEP_TIMESTAMP=$stamp
  echo CUDA_VISIBLE_DEVICES set to $CUDA_VISIBLE_DEVICES, timestamp=$STEP_TIMESTAMP
}

function release_gpu() {
  # Return the GPU leased by find_gpu to the broker so other jobs can use it.
  local endpoint="http://$RUNNER/gpu/release?id=$RUN_ID&gpu=$CUDA_VISIBLE_DEVICES&timestamp=$STEP_TIMESTAMP"
  curl -X GET "$endpoint"
}

function test() {
  # Log the working directory layout for debugging, then run the selected
  # pytest module (tests/$TEST_NAME.py).
  echo "current dir:"
  pwd
  echo "===="
  ls
  echo "===="
  ls ..
  echo "===="
  # FIX: use `return 1`, not `exit 1`. The caller invokes `test || true`
  # precisely so that release_gpu/clear_cache still run on failure; `exit 1`
  # inside the function terminated the whole script, skipping cleanup and
  # leaking the GPU allocation. ERROR=1 is still recorded for clear_cache.
  pytest --durations=0 tests/$TEST_NAME.py || { export ERROR=1; return 1; }
}

# ---- entry point: <test_name> <cuda> <torch> <python> <run_id> <docker> ----
if [ "$#" -ne 6 ]; then
  echo "Usage: $0 <test_name> <cuda> <torch> <python> <run_id> <docker>"
  exit 1
fi

test_name=$1
cuda=$2
torch=$3
python=$4
run_id=$5
docker=$6

# Echo each setting and export it for the functions above.
printf 'Test Name: %s\n' "$test_name"
export TEST_NAME="${test_name%.py}"
printf 'CUDA Version: %s\n' "$cuda"
export CUDA="$cuda"
printf 'Torch Version: %s\n' "$torch"
export TORCH="$torch"
printf 'Python Version: %s\n' "$python"
export PYTHON="$python"
printf 'Run id: %s\n' "$run_id"
export RUN_ID="$run_id"
printf 'Docker image: %s\n' "$docker"
export DOCKER_IMAGE="$docker"

install_requirements
compile
find_gpu
# `|| true` keeps the script alive on test failure so the GPU is released
# and the cache cleanup still runs.
test || true
release_gpu
clear_cache
11 changes: 6 additions & 5 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ on:
max-parallel:
description: 'max parallel jobs'
required: false
default: '6'
default: '12'
upload_release:
description: 'upload to release (it only works with a tag ref)'
type: boolean
Expand All @@ -43,7 +43,7 @@ on:
env:
CUDA_DEVICE_ORDER: PCI_BUS_ID
RUNNER: 10.0.14.248
TORCH_CUDA_ARCH_LIST: '6.0 6.1 6.2 7.0 7.5 8.0 8.6 8.9 9.0'
TORCH_CUDA_ARCH_LIST: '8.0 8.6 8.9 9.0'
CUDA_RELEASE: 1
CI: 1
GPTQMODEL_FORCE_BUILD: 1
Expand Down Expand Up @@ -82,7 +82,7 @@ jobs:
echo "ip: $ip"
max_p=${{ github.event.inputs.max-parallel }}
max_p="{\"size\": ${max_p:-6}}"
max_p="{\"size\": ${max_p:-12}}"
echo "max-parallel=$max_p" >> "$GITHUB_OUTPUT"
echo "max-parallel=$max_p"
Expand Down Expand Up @@ -121,7 +121,7 @@ jobs:
release:
strategy:
fail-fast: false
max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 6 }}
max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 12 }}
matrix:
tag: ${{ fromJSON(needs.check-vm.outputs.task_list) }}
runs-on: self-hosted
Expand Down Expand Up @@ -162,7 +162,7 @@ jobs:
cuda_version=$(echo ${{ matrix.tag }} | grep -oP 'cuda\K[0-9.]+')
torch_version=$(echo ${{ matrix.tag }} | grep -oP 'torch\K[0-9.]+')
python_version=$(echo ${{ matrix.tag }} | grep -oP 'python\K[0-9.]+')
bash -c "$(curl -L http://${RUNNER}/scripts/compiler/init_env.sh)" @ $cuda_version $torch_version $python_version
bash -c "$(curl -L http://${RUNNER}/files/scripts/init_compiler.sh)" @ $cuda_version $torch_version $python_version
- name: Compile
run: python setup.py bdist_wheel
Expand All @@ -189,6 +189,7 @@ jobs:
uses: actions/upload-artifact@v4
continue-on-error: ${{ env.UPLOADED == '1' }}
with:
overwrite: true
name: ${{ env.WHL_NAME }}
path: dist/${{ env.WHL_NAME }}

Expand Down
Loading

0 comments on commit f77d220

Please sign in to comment.