diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index bb0791dc9..c2e67d9b1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -22,13 +22,13 @@ on: required: false type: number target: - description: 'CUDA Torch Python version separated by space, check http://10.0.14.248/gpu/runner/docker to get all supported combinations' + description: 'CUDA Torch Python version separated by space, check http://10.0.13.31/gpu/runner/docker to get all supported combinations' required: false default: '' max-parallel: description: 'max parallel jobs' required: false - default: '12' + default: '10' upload_release: description: 'upload to release (it only works with a tag ref)' type: boolean @@ -57,7 +57,7 @@ concurrency: jobs: check-vm: - runs-on: self-hosted + runs-on: [self-hosted, Linux] container: image: modelcloud/gptqmodel:alpine-ci-v1 outputs: @@ -82,7 +82,7 @@ jobs: echo "ip: $ip" max_p=${{ github.event.inputs.max-parallel }} - max_p="{\"size\": ${max_p:-12}}" + max_p="{\"size\": ${max_p:-10}}" echo "max-parallel=$max_p" >> "$GITHUB_OUTPUT" echo "max-parallel=$max_p" @@ -121,10 +121,10 @@ jobs: release: strategy: fail-fast: false - max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 12 }} + max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 10 }} matrix: tag: ${{ fromJSON(needs.check-vm.outputs.task_list) }} - runs-on: self-hosted + runs-on: [self-hosted, Linux] needs: - check-vm if: needs.check-vm.outputs.task_list != '' && !cancelled() @@ -162,7 +162,7 @@ jobs: cuda_version=$(echo ${{ matrix.tag }} | grep -oP 'cuda\K[0-9.]+') torch_version=$(echo ${{ matrix.tag }} | grep -oP 'torch\K[0-9.]+') python_version=$(echo ${{ matrix.tag }} | grep -oP 'python\K[0-9.]+') - bash -c "$(curl -L http://${RUNNER}/files/scripts/init_compiler.sh)" @ $cuda_version $torch_version $python_version + bash -c "$(curl -L http://${RUNNER}/scripts/env/init_compiler.sh)" @ $cuda_version $torch_version $python_version - name: Compile run: python setup.py bdist_wheel @@ -204,7 +204,7 @@ jobs: overwrite: true release-source: - runs-on: self-hosted + runs-on: [self-hosted, Linux] needs: - check-vm container: @@ -269,11 +269,12 @@ jobs: - name: Waiting for confirmation if: (github.event_name == 'release' || github.event.inputs.upload_pypi == 'true') && !cancelled() run: | + timestamp=$(date +%s%3N) + echo "open http://${RUNNER}/gpu/ci/confirm?id=${{ github.run_id }}×tamp=$timestamp&confirmed=1 to confirm releasing to pypi" for i in {1..5}; do echo "."; done echo "click http://${RUNNER}/gpu/ci/confirm?id=${{ github.run_id }}×tamp=$timestamp&denied=1 to DENY" - timestamp=$(date +%s%3N) status=-1 while [ "$status" -lt 0 ]; do diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 1f9d8b550..fb1f5d494 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -36,15 +36,21 @@ on: description: 'max parallel jobs' required: false default: '10' + m4-only: + description: 'only run m4(test only)' + type: boolean + required: false + default: false env: CUDA_DEVICE_ORDER: PCI_BUS_ID CUDA_VISIBLE_DEVICES: 0 + TORCH_CUDA_ARCH_LIST: '8.9' MAX_JOBS: 8 RUNNER: 10.0.14.248 TRANSFORMERS_DIFF_TESTS: "models/test_internlm,models/test_internlm2_5,models/test_xverse" TORCH_2_5_TESTS: "test_evalplus,test_perplexity,test_q4_ipex.py,test_ipex_xpu.py,test_save_loaded_quantized_model,test_quant_formats,models/test_hymba" - IGNORED_TEST_FILES: "test_tgi.py,test_gptneox.py,models/test_mixtral" + 
IGNORED_TEST_FILES: "test_tgi.py,test_gptneox.py,models/test_mixtral,test_q4_torch_apple" GPTQMODEL_FORCE_BUILD: 1 repo: ${{ github.event.inputs.repo || github.repository }} ref: ${{ github.event.inputs.ref || github.ref }} @@ -55,7 +61,7 @@ concurrency: jobs: check-vm: - runs-on: self-hosted + runs-on: [self-hosted, Linux] container: image: modelcloud/gptqmodel:alpine-ci-v1 outputs: @@ -92,9 +98,9 @@ jobs: list-test-files: runs-on: ubuntu-latest + if: github.event.inputs.m4-only != 'true' outputs: torch-2-5-files: ${{ steps.files.outputs.torch-2-5-files }} - gpu-files: ${{ steps.files.outputs.gpu-files }} transformers-files: ${{ steps.files.outputs.transformers-files }} steps: @@ -121,44 +127,44 @@ jobs: import os import re - cpu_file_list = [f.strip().removesuffix('.py') for f in '${TORCH_2_5_TESTS}'.split(',') if f.strip()] - test_files_list = [f.strip().removesuffix('.py') for f in '${{ github.event.inputs.test_names }}'.split(',') if f.strip()] - cpu_test_files = [f for f in cpu_file_list if not test_files_list or f in test_files_list] - transformers_diff_list = [f.strip().removesuffix('.py') for f in '${TRANSFORMERS_DIFF_TESTS}'.split(',') if f.strip()] - transformers_test_files = [f for f in transformers_diff_list if not test_files_list or f in test_files_list] + TRANSFORMERS_DIFF_TESTS = '${TRANSFORMERS_DIFF_TESTS}' + IGNORED_TEST_FILES = '${IGNORED_TEST_FILES}' + + TEST_NAMES='${{ github.event.inputs.test_names }}' + TEST_REGEX='${{ github.event.inputs.test_regex }}' + + input_test_files_list = [f.strip().removesuffix('.py') for f in TEST_NAMES.split(',') if f.strip()] - all_tests = [f.removesuffix('.py') for f in os.listdir('tests/') if f.startswith('test_') and f.endswith('.py') and f.strip().removesuffix('.py') not in '${IGNORED_TEST_FILES}'] + transformers_test_files = [f.strip().removesuffix('.py') for f in f'{TRANSFORMERS_DIFF_TESTS}'.split(',') if f.strip()] + transformers_test_files = [f for f in transformers_test_files if not input_test_files_list or f in input_test_files_list] - all_tests_models = ['models/'+f.removesuffix('.py') for f in os.listdir('tests/models') if f.startswith('test_') and f.endswith('.py') and f.strip().removesuffix('.py') not in '${IGNORED_TEST_FILES}'] - all_tests = [item for item in all_tests+all_tests_models if item.strip().removesuffix('.py') not in '${TORCH_2_5_TESTS}'] + all_tests = [f.removesuffix('.py') for f in os.listdir('tests/') if f.startswith('test_') and f.endswith('.py') and f.strip().removesuffix('.py') not in f'{IGNORED_TEST_FILES}'] + all_tests_models = ['models/'+f.removesuffix('.py') for f in os.listdir('tests/models') if f.startswith('test_') and f.endswith('.py') and f.strip().removesuffix('.py') not in f'{IGNORED_TEST_FILES}'] - gpu_test_files = [f for f in all_tests if f not in cpu_file_list and (not test_files_list or f in test_files_list) and f not in transformers_diff_list] + torch_2_5_test_files = [f for f in all_tests+all_tests_models if (not input_test_files_list or f in input_test_files_list) and f not in transformers_test_files] - cpu_test_files = [test for test in cpu_test_files if re.match(r'${{ github.event.inputs.test_regex }}', test)] - gpu_test_files = [test for test in gpu_test_files if re.match(r'${{ github.event.inputs.test_regex }}', test)] - transformers_test_files = [test for test in transformers_test_files if re.match(r'${{ github.event.inputs.test_regex }}', test)] + torch_2_5_test_files = [test for test in torch_2_5_test_files if re.match(rf'{TEST_REGEX}', test)] + transformers_test_files = [test for 
test in transformers_test_files if re.match(rf'{TEST_REGEX}', test)] - print(f'{json.dumps(cpu_test_files)}|{json.dumps(gpu_test_files)}|{json.dumps(transformers_test_files)}') + print(f'{json.dumps(torch_2_5_test_files)}|{json.dumps(transformers_test_files)}') " test_files=$(python3 -c "$script") - IFS='|' read -r cpu_test_files gpu_test_files transformers_test_files <<< "$test_files" - echo "torch-2-5-files=$cpu_test_files" >> "$GITHUB_OUTPUT" - echo "gpu-files=$gpu_test_files" >> "$GITHUB_OUTPUT" + IFS='|' read -r torch_2_5_test_files transformers_test_files <<< "$test_files" + echo "torch-2-5-files=$torch_2_5_test_files" >> "$GITHUB_OUTPUT" echo "transformers-files=$transformers_test_files" >> "$GITHUB_OUTPUT" echo "Test files: $test_files" - echo "CPU Test files: $cpu_test_files" - echo "GPU Test files: $gpu_test_files" + echo "Torch 2.5 Test files: $torch_2_5_test_files" echo "Transformers Test files: $transformers_test_files" echo "Ignored Test files: $IGNORED_TEST_FILES" build: - runs-on: self-hosted + runs-on: [self-hosted, Linux] needs: check-vm - if: github.event.inputs.artifact_id == '' && !cancelled() + if: github.event.inputs.m4-only != 'true' && github.event.inputs.artifact_id == '' && !cancelled() container: - image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v2-torch2.4.1 + image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v5 steps: - name: Checkout Codes @@ -186,13 +192,8 @@ jobs: nvcc --version echo "== torch ==" pip show torch - - - name: Install requirements - run: | - echo "===== init test env =====" - bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.4.1 3.11 - echo "===== install transformers typing-extensions =====" - uv pip install transformers typing-extensions -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} + echo "##### pip list #####" + pip list - name: Compile timeout-minutes: 35 @@ -222,148 +223,6 @@ jobs: name: dist path: dist - torch2_4: - needs: - - build - - list-test-files - - check-vm - runs-on: self-hosted - if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && needs.list-test-files.outputs.gpu-files != '[]' - container: - image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v2-torch2.4.1 - volumes: - - /home/ci/models:/monster/data/model - strategy: - fail-fast: false - max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 10 }} - matrix: - test_script: ${{ fromJSON(needs.list-test-files.outputs.gpu-files) }} - - steps: - - name: Checkout Codes - uses: actions/checkout@v4 - with: - repository: ${{ github.event.inputs.repo }} - ref: ${{ github.event.inputs.ref }} - - - name: Fetch PR by number - if: ${{ github.event.inputs.pr_number != 0 }} - run: | - PR_NUMBER=${{ github.event.inputs.pr_number }} - echo "pr number $PR_NUMBER" - git config --global --add safe.directory $(pwd) - git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER} - git checkout pr-${PR_NUMBER} - - - name: Print Env - run: | - echo "== pyenv ==" - pyenv versions - echo "== python ==" - python --version - echo "== nvcc ==" - nvcc --version - echo "== torch ==" - pip show torch - - if [ -n "${{ github.event.inputs.artifact_id }}" ]; then - run_id="${{ github.event.inputs.artifact_id }}" - else - run_id="${{ github.run_id }}" - fi - echo "RUN_ID=$run_id" >> $GITHUB_ENV - echo "RUN_ID=${run_id}" - - if grep -q "bitblas" tests/${{ matrix.test_script }}.py; then 
- echo "BITBLAS=1" >> $GITHUB_ENV - fi - - - name: Download wheel - continue-on-error: true - run: | - file_name=$(curl -s -F "runid=${{ needs.check-vm.outputs.run_id }}" -F "repo=${{ env.repo }}" -F "ref=${{ env.ref }}" -F "fuzz=1" "http://${{ needs.check-vm.outputs.ip }}/gpu/whl/download") - - if echo "$file_name" | grep -q "gptqmodel"; then - mkdir dist || true - cd dist - curl -s -O http://${{ needs.check-vm.outputs.ip }}/whl/${{ env.repo }}/${{ needs.check-vm.outputs.run_id }}/$file_name - ls -ahl . - sha256=$(sha256sum $file_name) - echo "sha256=$sha256" - echo "DOWNLOADED=1" >> $GITHUB_ENV - fi - - - name: Download artifact - if: env.DOWNLOADED == '' && !cancelled() - uses: actions/download-artifact@v4 - with: - name: dist - path: dist - run-id: ${{ needs.check-vm.outputs.run_id }} - - - name: Install wheel - run: | - echo "===== install optimum bitblas =====" - uv pip install optimum bitblas==0.0.1.dev13 uvicorn -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} - echo "===== install dist/whl =====" - uv pip install dist/*.whl - if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then - echo "===== install auto_round =====" - uv pip install auto_round - fi - bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.4.1 3.11 - if [ "${{ matrix.test_script }}" == "models/test_cohere2" ]; then - echo "===== install transformers from git =====" - uv pip install -U git+https://github.com/huggingface/transformers.git@5615a393691c81e00251e420c73e4d04c6fe22e5 - else - echo "===== install transformers from pypi =====" - uv pip install transformers -U - fi - echo "===== install typing-extensions numpy==1.26.4 =====" - uv pip install typing-extensions numpy==1.26.4 -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} - - - name: Check platform - run: | - ip=${RUNNER} - echo "-----------" - pip show torch - echo "-----------" - nvcc --version - - - name: Find suitable GPU - run: | - timestamp=$(date +%s%3N) - gpu_id=-1 - - while [ "$gpu_id" -lt 0 ]; do - gpu_id=$(curl -s "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}×tamp=$timestamp") - - if [ "$gpu_id" -lt 0 ]; then - echo "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}×tamp=$timestamp returned $gpu_id" - echo "No available GPU, waiting 5 seconds..." 
- sleep 5 - else - echo "Allocated GPU ID: $gpu_id" - fi - done - echo "CUDA_VISIBLE_DEVICES=$gpu_id" >> $GITHUB_ENV - echo "STEP_TIMESTAMP=$timestamp" >> $GITHUB_ENV - echo "CUDA_VISIBLE_DEVICES set to $gpu_id, timestamp=$timestamp" - - - name: Run tests - if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }} - run: pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; } - - - name: Clear cache - if: failure() && env.BITBLAS == '1' && env.ERROR == '1' - run: | - rm -rf ~/.cache/bitblas/nvidia/geforce-rtx-4090 - echo "clear bitblas cache" - - - name: Release GPU - if: always() - run: curl -X GET "http://${{ needs.check-vm.outputs.ip }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}×tamp=${{ env.STEP_TIMESTAMP }}" - - name: Clean cache if: always() run: pip cache purge && uv cache clean @@ -373,10 +232,10 @@ jobs: - build - list-test-files - check-vm - runs-on: self-hosted - if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && needs.list-test-files.outputs.transformers-files != '[]' + runs-on: [self-hosted, Linux] + if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.transformers-files != '[]' container: - image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v2-torch2.4.1 + image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v5 volumes: - /home/ci/models:/monster/data/model strategy: @@ -410,6 +269,8 @@ jobs: nvcc --version echo "== torch ==" pip show torch + echo "== pip list ==" + pip list - name: Download wheel continue-on-error: true @@ -441,7 +302,6 @@ jobs: echo "===== install dist/whl =====" uv pip install dist/*.whl echo "===== init test env =====" - bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.4.1 3.11 echo "===== install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 =====" uv pip install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} if [ "${{ matrix.test_script }}" == "models/test_xverse" ]; then @@ -453,7 +313,12 @@ jobs: uv pip install auto_round fi + - name: Clean cache + if: always() + run: pip cache purge && uv cache clean + - name: Find suitable GPU + if: ${{ !contains(matrix.test_script, 'ipex') && !cancelled() }} run: | timestamp=$(date +%s%3N) gpu_id=-1 @@ -475,25 +340,26 @@ jobs: - name: Run tests if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }} - run: pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; } + run: | + start_time=$(date +%s) + pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; } + execution_time=$(( $(date +%s) - start_time )) + echo "$((execution_time / 60))m $((execution_time % 60))s" + curl "http://${{ needs.check-vm.outputs.ip }}/gpu/vram?gpu=${{ env.CUDA_VISIBLE_DEVICES }}&range=$execution_time&unit=second" - name: Release GPU - if: always() + if: always() && !contains(matrix.test_script, 'ipex') run: curl -X GET "http://${{ needs.check-vm.outputs.ip }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}×tamp=${{ 
env.STEP_TIMESTAMP }}" - - name: Clean cache - if: always() - run: pip cache purge && uv cache clean - torch2_5: needs: - build - list-test-files - check-vm - runs-on: self-hosted - if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && needs.list-test-files.outputs.torch-2-5-files != '[]' + runs-on: [self-hosted, Linux] + if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.torch-2-5-files != '[]' container: - image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v3-torch2.5.1 + image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v5 options: --device /dev/dri --ipc=host volumes: - /dev/dri/by-path:/dev/dri/by-path @@ -529,6 +395,8 @@ jobs: nvcc --version echo "== torch ==" pip show torch + echo "== pip list ==" + pip list - name: Download wheel continue-on-error: true @@ -554,19 +422,24 @@ jobs: - name: Install wheel run: | - if [ "${{ matrix.test_script }}" == "test_ipex_xpu" ]; then - source /etc/profile.d/pyenv.sh && pyenv activate xpu - else - bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.5.1 3.11 - fi - if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then echo "===== install auto_round =====" uv pip install auto_round fi - echo "===== install dist/whl =====" + if [ "${{ matrix.test_script }}" == "models/test_cohere2" ]; then + echo "===== install transformers from git =====" + uv pip install -U git+https://github.com/huggingface/transformers.git@5615a393691c81e00251e420c73e4d04c6fe22e5 + fi + if [ "${{ matrix.test_script }}" == "test_ipex_xpu" ]; then + source /etc/profile.d/pyenv.sh && pyenv activate xpu + fi + echo "===== install dist/whl =====" uv pip install dist/*.whl + - name: Clean cache + if: always() + run: pip cache purge && uv cache clean + - name: Find suitable GPU if: ${{ !contains(matrix.test_script, 'ipex') && !cancelled() }} run: | @@ -597,12 +470,32 @@ jobs: pip uninstall vllm -y pip list fi - pytest --durations=0 tests/${{ matrix.test_script }}.py + + start_time=$(date +%s) + pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; } + execution_time=$(( $(date +%s) - start_time )) + echo "$((execution_time / 60))m $((execution_time % 60))s" + curl "http://${{ needs.check-vm.outputs.ip }}/gpu/vram?gpu=${{ env.CUDA_VISIBLE_DEVICES }}&range=$execution_time&unit=second" - name: Release GPU if: always() && !contains(matrix.test_script, 'ipex') run: curl -X GET "http://${{ needs.check-vm.outputs.ip }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}×tamp=${{ env.STEP_TIMESTAMP }}" - - name: Clean cache - if: always() - run: pip cache purge && uv cache clean + + m4: + runs-on: [self-hosted, m4] + needs: check-vm + steps: + - name: Checkout Codes + uses: actions/checkout@v4 + with: + repository: ${{ env.repo }} + ref: ${{ env.ref }} + + - name: Run test + run: | + export PATH="/opt/homebrew/bin:$PATH" && eval "$(pyenv init -)" + pyenv global 3.11.11 && python -m venv venv + curl -O http://${{ needs.check-vm.outputs.ip }}/scripts/m4/profile.sb + curl -O http://${{ needs.check-vm.outputs.ip }}/scripts/m4/run.sh + sandbox-exec -f profile.sb /bin/bash ./run.sh diff --git a/README.md b/README.md index 397b26444..36ccb648d 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@

## News +* 01/01/2025 [1.5.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.5.1): 🎉 2025! Added `QuantizeConfig.device` to clearly define which device is used for quantization: default = `auto`. Non-quantized models are always loaded on cpu by-default and each layer is moved to `QuantizeConfig.device` during quantization to minimize vram usage. Compatibility fixes for `attn_implementation_autoset` in latest transformers. * 12/23/2024 [1.5.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.5.0): Multi-modal (image-to-text) optimized quantization support has been added for Qwen 2-VL and Ovis 1.6-VL. Previous image-to-text model quantizations did not use image calibration data, resulting in less than optimal post-quantization results. Version 1.5.0 is the first release to provide a stable path for multi-modal quantization: only text layers are quantized. * 12/19/2024 [1.4.5](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.4.5): Windows 11 support added/validated. Ovis VL model support with image dataset calibration. Fixed `dynamic` loading. Reduced quantization vram usage. * 12/15/2024 [1.4.2](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.4.2): MacOS `gpu` (Metal) and `cpu` (M+) support added/validated for inference and quantization. Cohere 2 model support added. @@ -160,7 +161,8 @@ quant_config = QuantizeConfig(bits=4, group_size=128) model = GPTQModel.load(model_id, quant_config) -model.quantize(calibration_dataset) +# increase `batch_size` to match gpu/vram specs to speed up quantization +model.quantize(calibration_dataset, batch_size=2) model.save(quant_path) diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py index 5b2d5da33..2b0a3e38b 100644 --- a/gptqmodel/models/auto.py +++ b/gptqmodel/models/auto.py @@ -142,11 +142,14 @@ def load( quantize_config: Optional[QuantizeConfig | Dict] = None, device_map: Optional[Union[str, Dict[str, Union[str, int]]]] = None, device: Optional[Union[str, torch.device]] = None, - backend: BACKEND = BACKEND.AUTO, + backend: Union[str, BACKEND] = BACKEND.AUTO, trust_remote_code: bool = False, verify_hash: Optional[Union[str, List[str]]] = None, **kwargs, ): + if isinstance(backend, str): + backend = BACKEND(backend) + if backend == BACKEND.VLLM: from ..integration.integration_vllm import patch_vllm patch_vllm() @@ -219,7 +222,7 @@ def from_quantized( model_id_or_path: Optional[str], device_map: Optional[Union[str, Dict[str, Union[str, int]]]] = None, device: Optional[Union[str, int]] = None, - backend: BACKEND = BACKEND.AUTO, + backend: Union[str, BACKEND] = BACKEND.AUTO, trust_remote_code: bool = False, # verify weight files matches predefined hash during loading # usage: hash_format:hash_value, example: md5:ugkdh232 @@ -229,10 +232,8 @@ def from_quantized( ) -> BaseGPTQModel: model_type = check_and_get_model_type(model_id_or_path, trust_remote_code) - if backend == BACKEND.AUTO: - if not torch.cuda.is_available() and HAS_IPEX: - logger.warning("No cuda found, use IPEX backend") - backend = BACKEND.IPEX + if isinstance(backend, str): + backend = BACKEND(backend) return MODEL_MAP[model_type].from_quantized( model_id_or_path=model_id_or_path, diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index c4423f03f..4d51b30d8 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -333,7 +333,7 @@ def collate_batch(batch): iters=self.quantize_config.iters, lr=self.quantize_config.lr, minmax_lr=self.quantize_config.minmax_lr, enable_quanted_input=self.quantize_config.enable_quanted_input, - 
device=self.hf_device_map, + device=self.device, amp=self.quantize_config.amp, low_gpu_mem_usage=self.quantize_config.low_gpu_mem_usage, seed=self.quantize_config.seed, diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 25dad5ff7..76da650fa 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -155,6 +155,10 @@ def skip(*args, **kwargs): # enforce some values despite user specified # non-quantized models are always loaded into cpu model_init_kwargs["device_map"] = cpu_device_map + # if flash_attn is installed and _attn_implementation_autoset is None, flash attention will be loaded, + # but the device map is cpu, which will throw a non-supported device error + if Version(transformers.__version__) >= Version("4.46.0"): + model_init_kwargs["_attn_implementation_autoset"] = True model_init_kwargs["torch_dtype"] = torch_dtype if config.model_type not in SUPPORTED_MODELS: @@ -190,7 +194,7 @@ def from_quantized( model_id_or_path: Optional[str], device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, device: Optional[Union[str, int]] = None, - backend: BACKEND = BACKEND.AUTO, + backend: Union[str, BACKEND] = BACKEND.AUTO, torch_dtype: [str | torch.dtype] = "auto", trust_remote_code: bool = False, verify_hash: Optional[Union[str, List[str]]] = None, @@ -200,6 +204,8 @@ device = normalize_device_device_map(device, device_map) # TODO need to normalize backend and others in a unified api + if isinstance(backend, str): + backend = BACKEND(backend) device = auto_select_device(device, backend) device_map = {"":device} @@ -306,7 +312,8 @@ marlin_compatible = False if backend == BACKEND.IPEX else _validate_marlin_device_support() - if backend != BACKEND.MARLIN: + # check marlin compatibility for cuda devices only + if backend != BACKEND.MARLIN and device == DEVICE.CUDA: unsupported = _validate_marlin_compatibility(quantize_config) if unsupported is None and marlin_compatible: logger.info( diff --git a/gptqmodel/nn_modules/qlinear/ipex.py b/gptqmodel/nn_modules/qlinear/ipex.py index 41faf4b47..ecb99d3b1 100644 --- a/gptqmodel/nn_modules/qlinear/ipex.py +++ b/gptqmodel/nn_modules/qlinear/ipex.py @@ -1,7 +1,6 @@ # License: GPTQModel/licenses/LICENSE.apache import math -import sys from typing import Optional, Tuple import numpy as np @@ -25,7 +24,8 @@ HAS_IPEX = False IPEX_ERROR_LOG = None try: - from intel_extension_for_pytorch.llm.quantization import IPEXWeightOnlyQuantizedLinear + from intel_extension_for_pytorch.llm.quantization import IPEXWeightOnlyQuantizedLinear, QuantDtype, QuantMethod + HAS_IPEX = True except BaseException: HAS_IPEX = False @@ -148,7 +148,7 @@ def init_ipex_linear(self, x: torch.Tensor): if not self.training and HAS_IPEX and not x.requires_grad: self.ipex_linear = IPEXWeightOnlyQuantizedLinear.from_weight(self.qweight, self.scales, self.qzeros, self.infeatures, self.outfeatures, None, self.bias, - self.group_size, self.g_idx, quant_method=0, dtype=0) + self.group_size, self.g_idx, quant_method=QuantMethod.GPTQ_GEMM, dtype=QuantDtype.INT4) def pack(self, linear, scales, zeros, g_idx=None): W = linear.weight.data.clone() diff --git a/gptqmodel/utils/importer.py b/gptqmodel/utils/importer.py index 217cb3c8d..fcda7593a 100644 --- a/gptqmodel/utils/importer.py +++ b/gptqmodel/utils/importer.py @@ -95,6 +95,7 @@ def hf_select_quant_linear( sym: bool, checkpoint_format: str, meta: Optional[Dict[str, any]] = None, + pack: Optional[bool] = True, device_map: Optional[Union[str, dict]] = None, backend: 
Optional[Union[str, BACKEND]] = None, ) -> Type[BaseQuantLinear]: @@ -115,8 +116,8 @@ def hf_select_quant_linear( backend=backend, device=device, format=FORMAT.GPTQ, - pack=True, - allow_marlin=False, # TODO: remove this after marlin padding is fixed + pack=pack, + allow_marlin=True, # TODO: remove this after marlin padding is fixed dynamic=None, ) diff --git a/gptqmodel/version.py b/gptqmodel/version.py index 8550c79bf..cdf70847e 100644 --- a/gptqmodel/version.py +++ b/gptqmodel/version.py @@ -1 +1 @@ -__version__ = "1.5.1-dev" +__version__ = "1.5.2-dev" diff --git a/tests/models/test_glm.py b/tests/models/test_glm.py index 6925a1847..df9eb7366 100644 --- a/tests/models/test_glm.py +++ b/tests/models/test_glm.py @@ -6,6 +6,7 @@ class TestGlm(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/glm-4-9b-chat-hf" NATIVE_ARC_CHALLENGE_ACC = 0.5154 NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5316 + QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 USE_VLLM = False def test_glm(self): diff --git a/tests/models/test_gptbigcode.py b/tests/models/test_gptbigcode.py index a1d12a9a5..a7f7a79c1 100644 --- a/tests/models/test_gptbigcode.py +++ b/tests/models/test_gptbigcode.py @@ -1,10 +1,18 @@ +import importlib.util +import os + +# TODO: find how ipex registered it jit interpreter +# if intel_extension_for_pytorch was installed, @torch.jit.script in transformers/models/gpt_bigcode/modeling_gpt_bigcode.py will try to use ipex as torchScript interpreter. +# However, in quantization, tensor were on gpu, which will throw RuntimeError: itensor_view_from_dense expects CPU tensor input +if importlib.util.find_spec("intel_extension_for_pytorch"): + os.environ["PYTORCH_JIT"] = "False" import torch # noqa: E402 -from model_test import ModelTest +from model_test import ModelTest # noqa: E402 class TestGptBigCode(ModelTest): - NATIVE_MODEL_ID = "/monster/data/model/gpt_bigcode-santacoder" # "bigcode/gpt_bigcode-santacoder" + NATIVE_MODEL_ID = "/monster/data/model/gpt_bigcode-santacoder" # "bigcode/gpt_bigcode-santacoder" NATIVE_ARC_CHALLENGE_ACC = 0.1689 NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2056 TORCH_DTYPE = torch.float16 @@ -12,4 +20,3 @@ class TestGptBigCode(ModelTest): def test_gptbigcode(self): self.quant_lm_eval() - diff --git a/tests/models/test_xverse.py b/tests/models/test_xverse.py index 4e3a29a5e..a411f5a91 100644 --- a/tests/models/test_xverse.py +++ b/tests/models/test_xverse.py @@ -5,6 +5,7 @@ class TestXVerse(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/XVERSE-7B-Chat" # "xverse/XVERSE-7B-Chat" NATIVE_ARC_CHALLENGE_ACC = 0.4198 NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4044 + QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 TRUST_REMOTE_CODE = True APPLY_CHAT_TEMPLATE = True BATCH_SIZE = 6 diff --git a/tests/test_q4_torch.py b/tests/test_q4_torch.py index d55964771..a8a6b0352 100644 --- a/tests/test_q4_torch.py +++ b/tests/test_q4_torch.py @@ -20,17 +20,11 @@ class TestsQ4Torch(unittest.TestCase): @parameterized.expand( [ - (torch.float16, "mps"), (torch.bfloat16, "cpu"), (torch.float16, "cuda"), ] ) def test_generation_desc_act_true(self, torch_dtype, device): - if sys.platform == "darwin" and device not in ["cpu","mps"]: - self.skipTest(f"MacOS env skipping unsupported device `{device}`") - elif sys.platform == "linux" and device not in ["cpu", "cuda", "xpu"]: - self.skipTest(f"MacOS env skipping unsupported device `{device}`") - prompt = "I am in Paris and" # CPU implementation is extremely slow. 
@@ -69,18 +63,12 @@ def test_generation_desc_act_true(self, torch_dtype, device): @parameterized.expand( [ (torch.bfloat16, "cpu"), - (torch.float16, "mps"), (torch.float16, "cuda"), # TODO: pending pytorch fix https://github.com/pytorch/pytorch/issues/100932 # (torch.float16, "cpu"), ] ) def test_generation_desc_act_false(self, torch_dtype, device): - if sys.platform == "darwin" and device not in ["cpu","mps"]: - self.skipTest(f"MacOS env skipping unsupported device `{device}`") - elif sys.platform == "linux" and device not in ["cpu", "cuda", "xpu"]: - self.skipTest(f"MacOS env skipping unsupported device `{device}`") - prompt = "I am in Paris and" # CPU implementation is extremely slow. diff --git a/tests/test_q4_torch_apple.py b/tests/test_q4_torch_apple.py new file mode 100644 index 000000000..07fa9cf33 --- /dev/null +++ b/tests/test_q4_torch_apple.py @@ -0,0 +1,98 @@ +import sys # noqa: E402 +import unittest # noqa: E402 + +import torch # noqa: E402 +from parameterized import parameterized # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + + +GENERATE_EVAL_SIZE = 100 + + +class TestsQ4Torch(unittest.TestCase): + @parameterized.expand( + [ + (torch.float16, "mps"), + (torch.bfloat16, "cpu"), + ] + ) + def test_generation_desc_act_true(self, torch_dtype, device): + if sys.platform != "darwin": + self.skipTest(f"This test is macOS only") + + prompt = "I am in Paris and" + + # CPU implementation is extremely slow. + new_tokens = 5 + reference_output = " I am in Paris and I am in love with" + + model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" + revision = "desc_act_true" + + model_q = GPTQModel.from_quantized( + model_id, + revision=revision, + device=device, + backend=BACKEND.TORCH, + torch_dtype=torch_dtype, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + + inp = tokenizer(prompt, return_tensors="pt").to(device) + + # This one uses Autocast. + res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + predicted_text = tokenizer.decode(res[0]) + print("predicted_text", predicted_text) + print("reference_output", reference_output) + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + + # This one does not. + res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + predicted_text = tokenizer.decode(res[0]) + print("predicted_text", predicted_text) + print("reference_output", reference_output) + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + + @parameterized.expand( + [ + (torch.bfloat16, "cpu"), + (torch.float16, "mps"), + ] + ) + def test_generation_desc_act_false(self, torch_dtype, device): + if sys.platform != "darwin": + self.skipTest(f"This test is macOS only") + + prompt = "I am in Paris and" + + # CPU implementation is extremely slow. + new_tokens = 5 + reference_output = " I am in Paris and I am in love with" + + model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" + + model_q = GPTQModel.from_quantized( + model_id, + device=device, + backend=BACKEND.TORCH, + torch_dtype=torch_dtype, + ) + tokenizer = AutoTokenizer.from_pretrained(model_id) + + inp = tokenizer(prompt, return_tensors="pt").to(device) + + # This one uses Autocast. 
+ res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + predicted_text = tokenizer.decode(res[0]) + + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + + # This one does not. + res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + predicted_text = tokenizer.decode(res[0]) + + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])
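
A sketch of the quantization flow behind the README `batch_size` note above; the model id, output path, and calibration texts are placeholders, not values taken from this patch:

```python
from gptqmodel import GPTQModel, QuantizeConfig

# placeholder calibration texts (real calibration sets are much larger)
calibration_dataset = [
    "gptqmodel is an easy-to-use model quantization library.",
    "The capital of France is Paris.",
]

quant_config = QuantizeConfig(bits=4, group_size=128)  # same config as the README example

model = GPTQModel.load("meta-llama/Llama-3.2-1B-Instruct", quant_config)  # placeholder model id

# a larger batch_size pushes more calibration rows through each forward pass;
# raise it to match available gpu/vram (the README example above uses 2)
model.quantize(calibration_dataset, batch_size=2)

model.save("Llama-3.2-1B-Instruct-gptq-4bit")  # placeholder output path
```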
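Usage sketch for the `backend: Union[str, BACKEND]` change in auto.py and loader.py above. The model id mirrors the new apple test, and the lowercase string value ("torch") is assumed to match the `BACKEND` enum's values:

```python
import torch
from transformers import AutoTokenizer

from gptqmodel import GPTQModel

model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"  # same checkpoint the apple test loads

# backend may now be a plain string; load()/from_quantized() coerce it via BACKEND(backend),
# so the value must match an enum member ("torch" is assumed to map to BACKEND.TORCH)
model = GPTQModel.from_quantized(
    model_id,
    device="cpu",
    backend="torch",
    torch_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer("I am in Paris and", return_tensors="pt").to("cpu")
out = model.generate(**inp, num_beams=1, max_new_tokens=5)
print(tokenizer.decode(out[0]))
```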
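The loader.py hunk above sets `_attn_implementation_autoset` on transformers >= 4.46.0 so a cpu-mapped model does not auto-select flash attention. A minimal standalone sketch of the same guard, assuming `from_pretrained` forwards the flag the same way the loader does (the model id and kwargs dict are illustrative):

```python
import transformers
from packaging.version import Version
from transformers import AutoModelForCausalLM

# non-quantized weights go to cpu first, as in loader.py
model_init_kwargs = {"device_map": {"": "cpu"}, "torch_dtype": "auto"}

# mark the attn implementation as already chosen so flash-attn (gpu-only kernels)
# is not auto-selected for a cpu-mapped model on newer transformers
if Version(transformers.__version__) >= Version("4.46.0"):
    model_init_kwargs["_attn_implementation_autoset"] = True

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", **model_init_kwargs)  # placeholder model id
```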