diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index bb0791dc9..c2e67d9b1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -22,13 +22,13 @@ on: required: false type: number target: - description: 'CUDA Torch Python version separated by space, check http://10.0.14.248/gpu/runner/docker to get all supported combinations' + description: 'CUDA Torch Python version separated by space, check http://10.0.13.31/gpu/runner/docker to get all supported combinations' required: false default: '' max-parallel: description: 'max parallel jobs' required: false - default: '12' + default: '10' upload_release: description: 'upload to release (it only works with a tag ref)' type: boolean @@ -57,7 +57,7 @@ concurrency: jobs: check-vm: - runs-on: self-hosted + runs-on: [self-hosted, Linux] container: image: modelcloud/gptqmodel:alpine-ci-v1 outputs: @@ -82,7 +82,7 @@ jobs: echo "ip: $ip" max_p=${{ github.event.inputs.max-parallel }} - max_p="{\"size\": ${max_p:-12}}" + max_p="{\"size\": ${max_p:-10}}" echo "max-parallel=$max_p" >> "$GITHUB_OUTPUT" echo "max-parallel=$max_p" @@ -121,10 +121,10 @@ jobs: release: strategy: fail-fast: false - max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 12 }} + max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 10 }} matrix: tag: ${{ fromJSON(needs.check-vm.outputs.task_list) }} - runs-on: self-hosted + runs-on: [self-hosted, Linux] needs: - check-vm if: needs.check-vm.outputs.task_list != '' && !cancelled() @@ -162,7 +162,7 @@ jobs: cuda_version=$(echo ${{ matrix.tag }} | grep -oP 'cuda\K[0-9.]+') torch_version=$(echo ${{ matrix.tag }} | grep -oP 'torch\K[0-9.]+') python_version=$(echo ${{ matrix.tag }} | grep -oP 'python\K[0-9.]+') - bash -c "$(curl -L http://${RUNNER}/files/scripts/init_compiler.sh)" @ $cuda_version $torch_version $python_version + bash -c "$(curl -L http://${RUNNER}/scripts/env/init_compiler.sh)" @ $cuda_version $torch_version $python_version - name: Compile run: python setup.py bdist_wheel @@ -204,7 +204,7 @@ jobs: overwrite: true release-source: - runs-on: self-hosted + runs-on: [self-hosted, Linux] needs: - check-vm container: @@ -269,11 +269,12 @@ jobs: - name: Waiting for confirmation if: (github.event_name == 'release' || github.event.inputs.upload_pypi == 'true') && !cancelled() run: | + timestamp=$(date +%s%3N) + echo "open http://${RUNNER}/gpu/ci/confirm?id=${{ github.run_id }}×tamp=$timestamp&confirmed=1 to confirm releasing to pypi" for i in {1..5}; do echo "."; done echo "click http://${RUNNER}/gpu/ci/confirm?id=${{ github.run_id }}×tamp=$timestamp&denied=1 to DENY" - timestamp=$(date +%s%3N) status=-1 while [ "$status" -lt 0 ]; do diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 1f9d8b550..fb1f5d494 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -36,15 +36,21 @@ on: description: 'max parallel jobs' required: false default: '10' + m4-only: + description: 'only run m4(test only)' + type: boolean + required: false + default: false env: CUDA_DEVICE_ORDER: PCI_BUS_ID CUDA_VISIBLE_DEVICES: 0 + TORCH_CUDA_ARCH_LIST: '8.9' MAX_JOBS: 8 RUNNER: 10.0.14.248 TRANSFORMERS_DIFF_TESTS: "models/test_internlm,models/test_internlm2_5,models/test_xverse" TORCH_2_5_TESTS: "test_evalplus,test_perplexity,test_q4_ipex.py,test_ipex_xpu.py,test_save_loaded_quantized_model,test_quant_formats,models/test_hymba" - IGNORED_TEST_FILES: "test_tgi.py,test_gptneox.py,models/test_mixtral" + 
IGNORED_TEST_FILES: "test_tgi.py,test_gptneox.py,models/test_mixtral,test_q4_torch_apple" GPTQMODEL_FORCE_BUILD: 1 repo: ${{ github.event.inputs.repo || github.repository }} ref: ${{ github.event.inputs.ref || github.ref }} @@ -55,7 +61,7 @@ concurrency: jobs: check-vm: - runs-on: self-hosted + runs-on: [self-hosted, Linux] container: image: modelcloud/gptqmodel:alpine-ci-v1 outputs: @@ -92,9 +98,9 @@ jobs: list-test-files: runs-on: ubuntu-latest + if: github.event.inputs.m4-only != 'true' outputs: torch-2-5-files: ${{ steps.files.outputs.torch-2-5-files }} - gpu-files: ${{ steps.files.outputs.gpu-files }} transformers-files: ${{ steps.files.outputs.transformers-files }} steps: @@ -121,44 +127,44 @@ jobs: import os import re - cpu_file_list = [f.strip().removesuffix('.py') for f in '${TORCH_2_5_TESTS}'.split(',') if f.strip()] - test_files_list = [f.strip().removesuffix('.py') for f in '${{ github.event.inputs.test_names }}'.split(',') if f.strip()] - cpu_test_files = [f for f in cpu_file_list if not test_files_list or f in test_files_list] - transformers_diff_list = [f.strip().removesuffix('.py') for f in '${TRANSFORMERS_DIFF_TESTS}'.split(',') if f.strip()] - transformers_test_files = [f for f in transformers_diff_list if not test_files_list or f in test_files_list] + TRANSFORMERS_DIFF_TESTS = '${TRANSFORMERS_DIFF_TESTS}' + IGNORED_TEST_FILES = '${IGNORED_TEST_FILES}' + + TEST_NAMES='${{ github.event.inputs.test_names }}' + TEST_REGEX='${{ github.event.inputs.test_regex }}' + + input_test_files_list = [f.strip().removesuffix('.py') for f in TEST_NAMES.split(',') if f.strip()] - all_tests = [f.removesuffix('.py') for f in os.listdir('tests/') if f.startswith('test_') and f.endswith('.py') and f.strip().removesuffix('.py') not in '${IGNORED_TEST_FILES}'] + transformers_test_files = [f.strip().removesuffix('.py') for f in f'{TRANSFORMERS_DIFF_TESTS}'.split(',') if f.strip()] + transformers_test_files = [f for f in transformers_test_files if not input_test_files_list or f in input_test_files_list] - all_tests_models = ['models/'+f.removesuffix('.py') for f in os.listdir('tests/models') if f.startswith('test_') and f.endswith('.py') and f.strip().removesuffix('.py') not in '${IGNORED_TEST_FILES}'] - all_tests = [item for item in all_tests+all_tests_models if item.strip().removesuffix('.py') not in '${TORCH_2_5_TESTS}'] + all_tests = [f.removesuffix('.py') for f in os.listdir('tests/') if f.startswith('test_') and f.endswith('.py') and f.strip().removesuffix('.py') not in f'{IGNORED_TEST_FILES}'] + all_tests_models = ['models/'+f.removesuffix('.py') for f in os.listdir('tests/models') if f.startswith('test_') and f.endswith('.py') and f.strip().removesuffix('.py') not in f'{IGNORED_TEST_FILES}'] - gpu_test_files = [f for f in all_tests if f not in cpu_file_list and (not test_files_list or f in test_files_list) and f not in transformers_diff_list] + torch_2_5_test_files = [f for f in all_tests+all_tests_models if (not input_test_files_list or f in input_test_files_list) and f not in transformers_test_files] - cpu_test_files = [test for test in cpu_test_files if re.match(r'${{ github.event.inputs.test_regex }}', test)] - gpu_test_files = [test for test in gpu_test_files if re.match(r'${{ github.event.inputs.test_regex }}', test)] - transformers_test_files = [test for test in transformers_test_files if re.match(r'${{ github.event.inputs.test_regex }}', test)] + torch_2_5_test_files = [test for test in torch_2_5_test_files if re.match(rf'{TEST_REGEX}', test)] + transformers_test_files = [test for 
test in transformers_test_files if re.match(rf'{TEST_REGEX}', test)] - print(f'{json.dumps(cpu_test_files)}|{json.dumps(gpu_test_files)}|{json.dumps(transformers_test_files)}') + print(f'{json.dumps(torch_2_5_test_files)}|{json.dumps(transformers_test_files)}') " test_files=$(python3 -c "$script") - IFS='|' read -r cpu_test_files gpu_test_files transformers_test_files <<< "$test_files" - echo "torch-2-5-files=$cpu_test_files" >> "$GITHUB_OUTPUT" - echo "gpu-files=$gpu_test_files" >> "$GITHUB_OUTPUT" + IFS='|' read -r torch_2_5_test_files transformers_test_files <<< "$test_files" + echo "torch-2-5-files=$torch_2_5_test_files" >> "$GITHUB_OUTPUT" echo "transformers-files=$transformers_test_files" >> "$GITHUB_OUTPUT" echo "Test files: $test_files" - echo "CPU Test files: $cpu_test_files" - echo "GPU Test files: $gpu_test_files" + echo "Torch 2.5 Test files: $torch_2_5_test_files" echo "Transformers Test files: $transformers_test_files" echo "Ignored Test files: $IGNORED_TEST_FILES" build: - runs-on: self-hosted + runs-on: [self-hosted, Linux] needs: check-vm - if: github.event.inputs.artifact_id == '' && !cancelled() + if: github.event.inputs.m4-only != 'true' && github.event.inputs.artifact_id == '' && !cancelled() container: - image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v2-torch2.4.1 + image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v5 steps: - name: Checkout Codes @@ -186,13 +192,8 @@ jobs: nvcc --version echo "== torch ==" pip show torch - - - name: Install requirements - run: | - echo "===== init test env =====" - bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.4.1 3.11 - echo "===== install transformers typing-extensions =====" - uv pip install transformers typing-extensions -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} + echo "##### pip list #####" + pip list - name: Compile timeout-minutes: 35 @@ -222,148 +223,6 @@ jobs: name: dist path: dist - torch2_4: - needs: - - build - - list-test-files - - check-vm - runs-on: self-hosted - if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && needs.list-test-files.outputs.gpu-files != '[]' - container: - image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v2-torch2.4.1 - volumes: - - /home/ci/models:/monster/data/model - strategy: - fail-fast: false - max-parallel: ${{ fromJson(needs.check-vm.outputs.max-parallel).size || 10 }} - matrix: - test_script: ${{ fromJSON(needs.list-test-files.outputs.gpu-files) }} - - steps: - - name: Checkout Codes - uses: actions/checkout@v4 - with: - repository: ${{ github.event.inputs.repo }} - ref: ${{ github.event.inputs.ref }} - - - name: Fetch PR by number - if: ${{ github.event.inputs.pr_number != 0 }} - run: | - PR_NUMBER=${{ github.event.inputs.pr_number }} - echo "pr number $PR_NUMBER" - git config --global --add safe.directory $(pwd) - git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER} - git checkout pr-${PR_NUMBER} - - - name: Print Env - run: | - echo "== pyenv ==" - pyenv versions - echo "== python ==" - python --version - echo "== nvcc ==" - nvcc --version - echo "== torch ==" - pip show torch - - if [ -n "${{ github.event.inputs.artifact_id }}" ]; then - run_id="${{ github.event.inputs.artifact_id }}" - else - run_id="${{ github.run_id }}" - fi - echo "RUN_ID=$run_id" >> $GITHUB_ENV - echo "RUN_ID=${run_id}" - - if grep -q "bitblas" tests/${{ matrix.test_script }}.py; then 
- echo "BITBLAS=1" >> $GITHUB_ENV - fi - - - name: Download wheel - continue-on-error: true - run: | - file_name=$(curl -s -F "runid=${{ needs.check-vm.outputs.run_id }}" -F "repo=${{ env.repo }}" -F "ref=${{ env.ref }}" -F "fuzz=1" "http://${{ needs.check-vm.outputs.ip }}/gpu/whl/download") - - if echo "$file_name" | grep -q "gptqmodel"; then - mkdir dist || true - cd dist - curl -s -O http://${{ needs.check-vm.outputs.ip }}/whl/${{ env.repo }}/${{ needs.check-vm.outputs.run_id }}/$file_name - ls -ahl . - sha256=$(sha256sum $file_name) - echo "sha256=$sha256" - echo "DOWNLOADED=1" >> $GITHUB_ENV - fi - - - name: Download artifact - if: env.DOWNLOADED == '' && !cancelled() - uses: actions/download-artifact@v4 - with: - name: dist - path: dist - run-id: ${{ needs.check-vm.outputs.run_id }} - - - name: Install wheel - run: | - echo "===== install optimum bitblas =====" - uv pip install optimum bitblas==0.0.1.dev13 uvicorn -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} - echo "===== install dist/whl =====" - uv pip install dist/*.whl - if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then - echo "===== install auto_round =====" - uv pip install auto_round - fi - bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.4.1 3.11 - if [ "${{ matrix.test_script }}" == "models/test_cohere2" ]; then - echo "===== install transformers from git =====" - uv pip install -U git+https://github.com/huggingface/transformers.git@5615a393691c81e00251e420c73e4d04c6fe22e5 - else - echo "===== install transformers from pypi =====" - uv pip install transformers -U - fi - echo "===== install typing-extensions numpy==1.26.4 =====" - uv pip install typing-extensions numpy==1.26.4 -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} - - - name: Check platform - run: | - ip=${RUNNER} - echo "-----------" - pip show torch - echo "-----------" - nvcc --version - - - name: Find suitable GPU - run: | - timestamp=$(date +%s%3N) - gpu_id=-1 - - while [ "$gpu_id" -lt 0 ]; do - gpu_id=$(curl -s "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}×tamp=$timestamp") - - if [ "$gpu_id" -lt 0 ]; then - echo "http://${{ needs.check-vm.outputs.ip }}/gpu/get?id=${{ github.run_id }}×tamp=$timestamp returned $gpu_id" - echo "No available GPU, waiting 5 seconds..." 
- sleep 5 - else - echo "Allocated GPU ID: $gpu_id" - fi - done - echo "CUDA_VISIBLE_DEVICES=$gpu_id" >> $GITHUB_ENV - echo "STEP_TIMESTAMP=$timestamp" >> $GITHUB_ENV - echo "CUDA_VISIBLE_DEVICES set to $gpu_id, timestamp=$timestamp" - - - name: Run tests - if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }} - run: pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; } - - - name: Clear cache - if: failure() && env.BITBLAS == '1' && env.ERROR == '1' - run: | - rm -rf ~/.cache/bitblas/nvidia/geforce-rtx-4090 - echo "clear bitblas cache" - - - name: Release GPU - if: always() - run: curl -X GET "http://${{ needs.check-vm.outputs.ip }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}×tamp=${{ env.STEP_TIMESTAMP }}" - - name: Clean cache if: always() run: pip cache purge && uv cache clean @@ -373,10 +232,10 @@ jobs: - build - list-test-files - check-vm - runs-on: self-hosted - if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && needs.list-test-files.outputs.transformers-files != '[]' + runs-on: [self-hosted, Linux] + if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.transformers-files != '[]' container: - image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v2-torch2.4.1 + image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v5 volumes: - /home/ci/models:/monster/data/model strategy: @@ -410,6 +269,8 @@ jobs: nvcc --version echo "== torch ==" pip show torch + echo "== pip list ==" + pip list - name: Download wheel continue-on-error: true @@ -441,7 +302,6 @@ jobs: echo "===== install dist/whl =====" uv pip install dist/*.whl echo "===== init test env =====" - bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.4.1 3.11 echo "===== install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 =====" uv pip install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} if [ "${{ matrix.test_script }}" == "models/test_xverse" ]; then @@ -453,7 +313,12 @@ jobs: uv pip install auto_round fi + - name: Clean cache + if: always() + run: pip cache purge && uv cache clean + - name: Find suitable GPU + if: ${{ !contains(matrix.test_script, 'ipex') && !cancelled() }} run: | timestamp=$(date +%s%3N) gpu_id=-1 @@ -475,25 +340,26 @@ jobs: - name: Run tests if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }} - run: pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; } + run: | + start_time=$(date +%s) + pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; } + execution_time=$(( $(date +%s) - start_time )) + echo "$((execution_time / 60))m $((execution_time % 60))s" + curl "http://${{ needs.check-vm.outputs.ip }}/gpu/vram?gpu=${{ env.CUDA_VISIBLE_DEVICES }}&range=$execution_time&unit=second" - name: Release GPU - if: always() + if: always() && !contains(matrix.test_script, 'ipex') run: curl -X GET "http://${{ needs.check-vm.outputs.ip }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}×tamp=${{ 
env.STEP_TIMESTAMP }}" - - name: Clean cache - if: always() - run: pip cache purge && uv cache clean - torch2_5: needs: - build - list-test-files - check-vm - runs-on: self-hosted - if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && needs.list-test-files.outputs.torch-2-5-files != '[]' + runs-on: [self-hosted, Linux] + if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && github.event.inputs.m4-only != 'true' && needs.list-test-files.outputs.torch-2-5-files != '[]' container: - image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v3-torch2.5.1 + image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v5 options: --device /dev/dri --ipc=host volumes: - /dev/dri/by-path:/dev/dri/by-path @@ -529,6 +395,8 @@ jobs: nvcc --version echo "== torch ==" pip show torch + echo "== pip list ==" + pip list - name: Download wheel continue-on-error: true @@ -554,19 +422,24 @@ jobs: - name: Install wheel run: | - if [ "${{ matrix.test_script }}" == "test_ipex_xpu" ]; then - source /etc/profile.d/pyenv.sh && pyenv activate xpu - else - bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.5.1 3.11 - fi - if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then echo "===== install auto_round =====" uv pip install auto_round fi - echo "===== install dist/whl =====" + if [ "${{ matrix.test_script }}" == "models/test_cohere2" ]; then + echo "===== install transformers from git =====" + uv pip install -U git+https://github.com/huggingface/transformers.git@5615a393691c81e00251e420c73e4d04c6fe22e5 + fi + if [ "${{ matrix.test_script }}" == "test_ipex_xpu" ]; then + source /etc/profile.d/pyenv.sh && pyenv activate xpu + fi + echo "===== install dist/whl =====" uv pip install dist/*.whl + - name: Clean cache + if: always() + run: pip cache purge && uv cache clean + - name: Find suitable GPU if: ${{ !contains(matrix.test_script, 'ipex') && !cancelled() }} run: | @@ -597,12 +470,32 @@ jobs: pip uninstall vllm -y pip list fi - pytest --durations=0 tests/${{ matrix.test_script }}.py + + start_time=$(date +%s) + pytest --durations=0 tests/${{ matrix.test_script }}.py || { echo "ERROR=1" >> $GITHUB_ENV; exit 1; } + execution_time=$(( $(date +%s) - start_time )) + echo "$((execution_time / 60))m $((execution_time % 60))s" + curl "http://${{ needs.check-vm.outputs.ip }}/gpu/vram?gpu=${{ env.CUDA_VISIBLE_DEVICES }}&range=$execution_time&unit=second" - name: Release GPU if: always() && !contains(matrix.test_script, 'ipex') run: curl -X GET "http://${{ needs.check-vm.outputs.ip }}/gpu/release?id=${{ github.run_id }}&gpu=${{ env.CUDA_VISIBLE_DEVICES }}×tamp=${{ env.STEP_TIMESTAMP }}" - - name: Clean cache - if: always() - run: pip cache purge && uv cache clean + + m4: + runs-on: [self-hosted, m4] + needs: check-vm + steps: + - name: Checkout Codes + uses: actions/checkout@v4 + with: + repository: ${{ env.repo }} + ref: ${{ env.ref }} + + - name: Run test + run: | + export PATH="/opt/homebrew/bin:$PATH" && eval "$(pyenv init -)" + pyenv global 3.11.11 && python -m venv venv + curl -O http://${{ needs.check-vm.outputs.ip }}/scripts/m4/profile.sb + curl -O http://${{ needs.check-vm.outputs.ip }}/scripts/m4/run.sh + sandbox-exec -f profile.sb /bin/bash ./run.sh diff --git a/README.md b/README.md index 397b26444..36ccb648d 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@

## News +* 01/01/2025 [1.5.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.5.1): 🎉 2025! Added `QuantizeConfig.device` to clearly define which device is used for quantization: default = `auto`. Non-quantized models are always loaded on cpu by-default and each layer is moved to `QuantizeConfig.device` during quantization to minimize vram usage. Compatibility fixes for `attn_implementation_autoset` in latest transformers. * 12/23/2024 [1.5.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.5.0): Multi-modal (image-to-text) optimized quantization support has been added for Qwen 2-VL and Ovis 1.6-VL. Previous image-to-text model quantizations did not use image calibration data, resulting in less than optimal post-quantization results. Version 1.5.0 is the first release to provide a stable path for multi-modal quantization: only text layers are quantized. * 12/19/2024 [1.4.5](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.4.5): Windows 11 support added/validated. Ovis VL model support with image dataset calibration. Fixed `dynamic` loading. Reduced quantization vram usage. * 12/15/2024 [1.4.2](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.4.2): MacOS `gpu` (Metal) and `cpu` (M+) support added/validated for inference and quantization. Cohere 2 model support added. @@ -160,7 +161,8 @@ quant_config = QuantizeConfig(bits=4, group_size=128) model = GPTQModel.load(model_id, quant_config) -model.quantize(calibration_dataset) +# increase `batch_size` to match gpu/vram specs to speed up quantization +model.quantize(calibration_dataset, batch_size=2) model.save(quant_path) diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py index 5b2d5da33..2b0a3e38b 100644 --- a/gptqmodel/models/auto.py +++ b/gptqmodel/models/auto.py @@ -142,11 +142,14 @@ def load( quantize_config: Optional[QuantizeConfig | Dict] = None, device_map: Optional[Union[str, Dict[str, Union[str, int]]]] = None, device: Optional[Union[str, torch.device]] = None, - backend: BACKEND = BACKEND.AUTO, + backend: Union[str, BACKEND] = BACKEND.AUTO, trust_remote_code: bool = False, verify_hash: Optional[Union[str, List[str]]] = None, **kwargs, ): + if isinstance(backend, str): + backend = BACKEND(backend) + if backend == BACKEND.VLLM: from ..integration.integration_vllm import patch_vllm patch_vllm() @@ -219,7 +222,7 @@ def from_quantized( model_id_or_path: Optional[str], device_map: Optional[Union[str, Dict[str, Union[str, int]]]] = None, device: Optional[Union[str, int]] = None, - backend: BACKEND = BACKEND.AUTO, + backend: Union[str, BACKEND] = BACKEND.AUTO, trust_remote_code: bool = False, # verify weight files matches predefined hash during loading # usage: hash_format:hash_value, example: md5:ugkdh232 @@ -229,10 +232,8 @@ def from_quantized( ) -> BaseGPTQModel: model_type = check_and_get_model_type(model_id_or_path, trust_remote_code) - if backend == BACKEND.AUTO: - if not torch.cuda.is_available() and HAS_IPEX: - logger.warning("No cuda found, use IPEX backend") - backend = BACKEND.IPEX + if isinstance(backend, str): + backend = BACKEND(backend) return MODEL_MAP[model_type].from_quantized( model_id_or_path=model_id_or_path, diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index c4423f03f..4d51b30d8 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -333,7 +333,7 @@ def collate_batch(batch): iters=self.quantize_config.iters, lr=self.quantize_config.lr, minmax_lr=self.quantize_config.minmax_lr, enable_quanted_input=self.quantize_config.enable_quanted_input, - 
device=self.hf_device_map, + device=self.device, amp=self.quantize_config.amp, low_gpu_mem_usage=self.quantize_config.low_gpu_mem_usage, seed=self.quantize_config.seed, diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 25dad5ff7..76da650fa 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -155,6 +155,10 @@ def skip(*args, **kwargs): # enforce some values despite user specified # non-quantized models are always loaded into cpu model_init_kwargs["device_map"] = cpu_device_map + # if flash_attn is installed and _attn_implementation_autoset is None, flash attention will be loaded, + # but the device map is cpu, which will throw a non-supported device error + if Version(transformers.__version__) >= Version("4.46.0"): + model_init_kwargs["_attn_implementation_autoset"] = True model_init_kwargs["torch_dtype"] = torch_dtype if config.model_type not in SUPPORTED_MODELS: @@ -190,7 +194,7 @@ def from_quantized( model_id_or_path: Optional[str], device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None, device: Optional[Union[str, int]] = None, - backend: BACKEND = BACKEND.AUTO, + backend: Union[str, BACKEND] = BACKEND.AUTO, torch_dtype: [str | torch.dtype] = "auto", trust_remote_code: bool = False, verify_hash: Optional[Union[str, List[str]]] = None, @@ -200,6 +204,8 @@ device = normalize_device_device_map(device, device_map) # TODO need to normalize backend and others in a unified api + if isinstance(backend, str): + backend = BACKEND(backend) device = auto_select_device(device, backend) device_map = {"":device} @@ -306,7 +312,8 @@ marlin_compatible = False if backend == BACKEND.IPEX else _validate_marlin_device_support() - if backend != BACKEND.MARLIN: + # check marlin compatibility for cuda devices only + if backend != BACKEND.MARLIN and device == DEVICE.CUDA: unsupported = _validate_marlin_compatibility(quantize_config) if unsupported is None and marlin_compatible: logger.info( diff --git a/gptqmodel/nn_modules/qlinear/ipex.py b/gptqmodel/nn_modules/qlinear/ipex.py index 41faf4b47..ecb99d3b1 100644 --- a/gptqmodel/nn_modules/qlinear/ipex.py +++ b/gptqmodel/nn_modules/qlinear/ipex.py @@ -1,7 +1,6 @@ # License: GPTQModel/licenses/LICENSE.apache import math -import sys from typing import Optional, Tuple import numpy as np @@ -25,7 +24,8 @@ HAS_IPEX = False IPEX_ERROR_LOG = None try: - from intel_extension_for_pytorch.llm.quantization import IPEXWeightOnlyQuantizedLinear + from intel_extension_for_pytorch.llm.quantization import IPEXWeightOnlyQuantizedLinear, QuantDtype, QuantMethod + HAS_IPEX = True except BaseException: HAS_IPEX = False @@ -148,7 +148,7 @@ def init_ipex_linear(self, x: torch.Tensor): if not self.training and HAS_IPEX and not x.requires_grad: self.ipex_linear = IPEXWeightOnlyQuantizedLinear.from_weight(self.qweight, self.scales, self.qzeros, self.infeatures, self.outfeatures, None, self.bias, - self.group_size, self.g_idx, quant_method=0, dtype=0) + self.group_size, self.g_idx, quant_method=QuantMethod.GPTQ_GEMM, dtype=QuantDtype.INT4) def pack(self, linear, scales, zeros, g_idx=None): W = linear.weight.data.clone() diff --git a/gptqmodel/utils/importer.py b/gptqmodel/utils/importer.py index 217cb3c8d..fcda7593a 100644 --- a/gptqmodel/utils/importer.py +++ b/gptqmodel/utils/importer.py @@ -95,6 +95,7 @@ def hf_select_quant_linear( sym: bool, checkpoint_format: str, meta: Optional[Dict[str, any]] = None, + pack: Optional[bool] = True, device_map: Optional[Union[str, dict]] = None, backend: 
Optional[Union[str, BACKEND]] = None, ) -> Type[BaseQuantLinear]: @@ -115,8 +116,8 @@ def hf_select_quant_linear( backend=backend, device=device, format=FORMAT.GPTQ, - pack=True, - allow_marlin=False, # TODO: remove this after marlin padding is fixed + pack=pack, + allow_marlin=True, # TODO: remove this after marlin padding is fixed dynamic=None, ) diff --git a/gptqmodel/version.py b/gptqmodel/version.py index 8550c79bf..cdf70847e 100644 --- a/gptqmodel/version.py +++ b/gptqmodel/version.py @@ -1 +1 @@ -__version__ = "1.5.1-dev" +__version__ = "1.5.2-dev" diff --git a/tests/models/test_glm.py b/tests/models/test_glm.py index 6925a1847..df9eb7366 100644 --- a/tests/models/test_glm.py +++ b/tests/models/test_glm.py @@ -6,6 +6,7 @@ class TestGlm(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/glm-4-9b-chat-hf" NATIVE_ARC_CHALLENGE_ACC = 0.5154 NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5316 + QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 USE_VLLM = False def test_glm(self): diff --git a/tests/models/test_gptbigcode.py b/tests/models/test_gptbigcode.py index a1d12a9a5..a7f7a79c1 100644 --- a/tests/models/test_gptbigcode.py +++ b/tests/models/test_gptbigcode.py @@ -1,10 +1,18 @@ +import importlib.util +import os + +# TODO: find how ipex registered it jit interpreter +# if intel_extension_for_pytorch was installed, @torch.jit.script in transformers/models/gpt_bigcode/modeling_gpt_bigcode.py will try to use ipex as torchScript interpreter. +# However, in quantization, tensor were on gpu, which will throw RuntimeError: itensor_view_from_dense expects CPU tensor input +if importlib.util.find_spec("intel_extension_for_pytorch"): + os.environ["PYTORCH_JIT"] = "False" import torch # noqa: E402 -from model_test import ModelTest +from model_test import ModelTest # noqa: E402 class TestGptBigCode(ModelTest): - NATIVE_MODEL_ID = "/monster/data/model/gpt_bigcode-santacoder" # "bigcode/gpt_bigcode-santacoder" + NATIVE_MODEL_ID = "/monster/data/model/gpt_bigcode-santacoder" # "bigcode/gpt_bigcode-santacoder" NATIVE_ARC_CHALLENGE_ACC = 0.1689 NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2056 TORCH_DTYPE = torch.float16 @@ -12,4 +20,3 @@ class TestGptBigCode(ModelTest): def test_gptbigcode(self): self.quant_lm_eval() - diff --git a/tests/models/test_xverse.py b/tests/models/test_xverse.py index 4e3a29a5e..a411f5a91 100644 --- a/tests/models/test_xverse.py +++ b/tests/models/test_xverse.py @@ -5,6 +5,7 @@ class TestXVerse(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/XVERSE-7B-Chat" # "xverse/XVERSE-7B-Chat" NATIVE_ARC_CHALLENGE_ACC = 0.4198 NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4044 + QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 TRUST_REMOTE_CODE = True APPLY_CHAT_TEMPLATE = True BATCH_SIZE = 6 diff --git a/tests/test_q4_torch.py b/tests/test_q4_torch.py index d55964771..a8a6b0352 100644 --- a/tests/test_q4_torch.py +++ b/tests/test_q4_torch.py @@ -20,17 +20,11 @@ class TestsQ4Torch(unittest.TestCase): @parameterized.expand( [ - (torch.float16, "mps"), (torch.bfloat16, "cpu"), (torch.float16, "cuda"), ] ) def test_generation_desc_act_true(self, torch_dtype, device): - if sys.platform == "darwin" and device not in ["cpu","mps"]: - self.skipTest(f"MacOS env skipping unsupported device `{device}`") - elif sys.platform == "linux" and device not in ["cpu", "cuda", "xpu"]: - self.skipTest(f"MacOS env skipping unsupported device `{device}`") - prompt = "I am in Paris and" # CPU implementation is extremely slow. 
@@ -69,18 +63,12 @@ def test_generation_desc_act_true(self, torch_dtype, device): @parameterized.expand( [ (torch.bfloat16, "cpu"), - (torch.float16, "mps"), (torch.float16, "cuda"), # TODO: pending pytorch fix https://github.com/pytorch/pytorch/issues/100932 # (torch.float16, "cpu"), ] ) def test_generation_desc_act_false(self, torch_dtype, device): - if sys.platform == "darwin" and device not in ["cpu","mps"]: - self.skipTest(f"MacOS env skipping unsupported device `{device}`") - elif sys.platform == "linux" and device not in ["cpu", "cuda", "xpu"]: - self.skipTest(f"MacOS env skipping unsupported device `{device}`") - prompt = "I am in Paris and" # CPU implementation is extremely slow. diff --git a/tests/test_q4_torch_apple.py b/tests/test_q4_torch_apple.py new file mode 100644 index 000000000..07fa9cf33 --- /dev/null +++ b/tests/test_q4_torch_apple.py @@ -0,0 +1,98 @@ +import sys # noqa: E402 +import unittest # noqa: E402 + +import torch # noqa: E402 +from parameterized import parameterized # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + + +GENERATE_EVAL_SIZE = 100 + + +class TestsQ4Torch(unittest.TestCase): + @parameterized.expand( + [ + (torch.float16, "mps"), + (torch.bfloat16, "cpu"), + ] + ) + def test_generation_desc_act_true(self, torch_dtype, device): + if sys.platform != "darwin": + self.skipTest(f"This test is macOS only") + + prompt = "I am in Paris and" + + # CPU implementation is extremely slow. + new_tokens = 5 + reference_output = " I am in Paris and I am in love with" + + model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" + revision = "desc_act_true" + + model_q = GPTQModel.from_quantized( + model_id, + revision=revision, + device=device, + backend=BACKEND.TORCH, + torch_dtype=torch_dtype, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + + inp = tokenizer(prompt, return_tensors="pt").to(device) + + # This one uses Autocast. + res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + predicted_text = tokenizer.decode(res[0]) + print("predicted_text", predicted_text) + print("reference_output", reference_output) + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + + # This one does not. + res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + predicted_text = tokenizer.decode(res[0]) + print("predicted_text", predicted_text) + print("reference_output", reference_output) + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + + @parameterized.expand( + [ + (torch.bfloat16, "cpu"), + (torch.float16, "mps"), + ] + ) + def test_generation_desc_act_false(self, torch_dtype, device): + if sys.platform != "darwin": + self.skipTest(f"This test is macOS only") + + prompt = "I am in Paris and" + + # CPU implementation is extremely slow. + new_tokens = 5 + reference_output = " I am in Paris and I am in love with" + + model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" + + model_q = GPTQModel.from_quantized( + model_id, + device=device, + backend=BACKEND.TORCH, + torch_dtype=torch_dtype, + ) + tokenizer = AutoTokenizer.from_pretrained(model_id) + + inp = tokenizer(prompt, return_tensors="pt").to(device) + + # This one uses Autocast. 
+ res = model_q.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + predicted_text = tokenizer.decode(res[0]) + + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) + + # This one does not. + res = model_q.model.generate(**inp, num_beams=1, min_new_tokens=new_tokens, max_new_tokens=new_tokens) + predicted_text = tokenizer.decode(res[0]) + + self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE])
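
A sketch of the quantization flow behind the README `batch_size` note above; the model id, output path, and calibration texts are placeholders, not values taken from this patch:

```python
from gptqmodel import GPTQModel, QuantizeConfig

# placeholder calibration texts (real calibration sets are much larger)
calibration_dataset = [
    "gptqmodel is an easy-to-use model quantization library.",
    "The capital of France is Paris.",
]

quant_config = QuantizeConfig(bits=4, group_size=128)  # same config as the README example

model = GPTQModel.load("meta-llama/Llama-3.2-1B-Instruct", quant_config)  # placeholder model id

# a larger batch_size pushes more calibration rows through each forward pass;
# raise it to match available gpu/vram (the README example above uses 2)
model.quantize(calibration_dataset, batch_size=2)

model.save("Llama-3.2-1B-Instruct-gptq-4bit")  # placeholder output path
```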
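Usage sketch for the `backend: Union[str, BACKEND]` change in auto.py and loader.py above. The model id mirrors the new apple test, and the lowercase string value ("torch") is assumed to match the `BACKEND` enum's values:

```python
import torch
from transformers import AutoTokenizer

from gptqmodel import GPTQModel

model_id = "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"  # same checkpoint the apple test loads

# backend may now be a plain string; load()/from_quantized() coerce it via BACKEND(backend),
# so the value must match an enum member ("torch" is assumed to map to BACKEND.TORCH)
model = GPTQModel.from_quantized(
    model_id,
    device="cpu",
    backend="torch",
    torch_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
inp = tokenizer("I am in Paris and", return_tensors="pt").to("cpu")
out = model.generate(**inp, num_beams=1, max_new_tokens=5)
print(tokenizer.decode(out[0]))
```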
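The loader.py hunk above sets `_attn_implementation_autoset` on transformers >= 4.46.0 so a cpu-mapped model does not auto-select flash attention. A minimal standalone sketch of the same guard, assuming `from_pretrained` forwards the flag the same way the loader does (the model id and kwargs dict are illustrative):

```python
import transformers
from packaging.version import Version
from transformers import AutoModelForCausalLM

# non-quantized weights go to cpu first, as in loader.py
model_init_kwargs = {"device_map": {"": "cpu"}, "torch_dtype": "auto"}

# mark the attn implementation as already chosen so flash-attn (gpu-only kernels)
# is not auto-selected for a cpu-mapped model on newer transformers
if Version(transformers.__version__) >= Version("4.46.0"):
    model_init_kwargs["_attn_implementation_autoset"] = True

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", **model_init_kwargs)  # placeholder model id
```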