diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 0ef843967..811b6149d 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -44,7 +44,7 @@ env: MAX_JOBS: 8 RUNNER: 10.0.14.248 TRANSFORMERS_DIFF_TESTS: "models/test_internlm,models/test_internlm2_5,models/test_xverse" - TORCH_2_5_TESTS: "test_q4_ipex.py,test_save_loaded_quantized_model,test_quant_formats,models/test_hymba" + TORCH_2_5_TESTS: "test_q4_ipex.py,test_ipex_xpu.py,test_save_loaded_quantized_model,test_quant_formats,models/test_hymba" IGNORED_TEST_FILES: "test_tgi.py,test_gptneox.py,models/test_mixtral" GPTQMODEL_FORCE_BUILD: 1 repo: ${{ github.event.inputs.repo || github.repository }} @@ -190,7 +190,9 @@ jobs: - name: Install requirements run: | + echo "===== init test env =====" bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.4.1 3.11 + echo "===== install transformers typing-extensions =====" uv pip install transformers typing-extensions -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} - name: Compile @@ -302,20 +304,24 @@ jobs: - name: Install wheel run: | - uv pip install optimum bitblas==0.0.1.dev13 parameterized uvicorn -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} + echo "===== install optimum bitblas =====" + uv pip install optimum bitblas==0.0.1.dev13 uvicorn -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} + echo "===== install dist/whl =====" uv pip install dist/*.whl if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then + echo "===== install auto_round =====" uv pip install auto_round fi bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.4.1 3.11 - uv pip install typing-extensions numpy==1.26.4 -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} if [ "${{ matrix.test_script }}" == "test_cohere2" ]; then + echo "===== install transformers from git =====" uv pip install -U git+https://github.com/huggingface/transformers.git@5615a393691c81e00251e420c73e4d04c6fe22e5 else + echo "===== install transformers from pypi =====" uv pip install transformers -U fi - - + echo "===== install typing-extensions numpy==1.26.4 =====" + uv pip install typing-extensions numpy==1.26.4 -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} - name: Check platform run: | @@ -427,14 +433,20 @@ jobs: - name: Install wheel run: | + echo "===== install optimum bitblas parameterized uvicorn =====" uv pip install optimum bitblas==0.0.1.dev13 parameterized uvicorn -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} + echo "===== install dist/whl =====" uv pip install dist/*.whl + echo "===== init test env =====" bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.4.1 3.11 + echo "===== install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 =====" uv pip install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} if [ "${{ matrix.test_script }}" = "test_xverse" ]; then + echo "===== install tokenizers==0.15.2 =====" uv pip install tokenizers==0.15.2 -i http://${{ needs.check-vm.outputs.ip }}/simple/ 
--trusted-host ${{ needs.check-vm.outputs.ip }} fi if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then + echo "===== install auto_round =====" uv pip install auto_round fi @@ -474,8 +486,10 @@ jobs: runs-on: self-hosted if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && needs.list-test-files.outputs.torch-2-5-files != '[]' container: - image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v2-torch2.5.1 + image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v3-torch2.5.1 + options: --device /dev/dri --ipc=host volumes: + - /dev/dri/by-path:/dev/dri/by-path - /home/ci/models:/monster/data/model strategy: fail-fast: false @@ -533,11 +547,17 @@ jobs: - name: Install wheel run: | - bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.5.1 3.11 - uv pip install -U intel_extension_for_pytorch typing-extensions bitblas==0.0.1.dev13 -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }} + if [ "${{ matrix.test_script }}" == "test_ipex_xpu" ]; then + source /etc/profile.d/pyenv.sh && pyenv activate xpu + else + bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.5.1 3.11 + fi + if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then + echo "===== install auto_round =====" uv pip install auto_round fi + echo "===== install dist/whl =====" uv pip install dist/*.whl - name: Find suitable GPU @@ -562,7 +582,16 @@ jobs: - name: Run tests if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }} - run: pytest --durations=0 tests/${{ matrix.test_script }}.py + run: | + if [ "${{ matrix.test_script }}" == "test_ipex_xpu" ]; then + export CUDA_VISIBLE_DEVICES="" + source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh + source $ONEAPI_ROOT/../pti/0.9/env/vars.sh + export Pti_DIR=$ONEAPI_ROOT/../pti/0.9/lib/cmake/pti + source /etc/profile.d/pyenv.sh && pyenv activate xpu + pip list + fi + pytest --durations=0 tests/${{ matrix.test_script }}.py - name: Release GPU if: always() diff --git a/README.md b/README.md index cd83c98c2..758f3f9dd 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@

## News +* 12/16/2024 1.4.5-dev: Windows 11 support added/validated. Fixed `dynamic` loading. * 12/15/2024 [1.4.2](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.4.2): MacOS `gpu` (Metal) and `cpu` (M+) support added/validated for inference and quantization. Cohere 2 model support added. * 12/13/2024 [1.4.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.4.1): Added Qwen2-VL model support. `mse` quantization control exposed in `QuantizeConfig`. Monkey patch `patch_vllm()` and `patch_hf()` apis added to allow Transformers/Optimum/PEFT and vLLM to correctly load GPTQModel quantized models while upstream PRs are pending. * 12/10/2024 [1.4.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.4.0) `EvalPlus` harness integration merged upstream. We now support both `lm-eval` and `EvalPlus`. Added pure torch `Torch` kernel. Refactored `Cuda` kernel to be `DynamicCuda` kernel. `Triton` kernel now auto-padded for max model support. `Dynamic` quantization now supports both positive `+:` (default) and negative `-:` matching, which allows matched modules to be skipped entirely for quantization. Fixed auto-`Marlin` kernel selection. Added auto-kernel fallback for unsupported kernel/module pairs. Lots of internal refactor and cleanup in preparation for transformers/optimum/peft upstream PR merge. Deprecated the saving of `Marlin` weight format since `Marlin` supports auto conversion of `gptq` format to `Marlin` during runtime. @@ -16,12 +17,14 @@ * 11/29/2024 [1.3.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.3.1) Olmo2 model support. Intel XPU acceleration via IPEX. Model sharding Transformer compat fix due to api deprecation in HF. Removed triton dependency. Triton kernel now optionally dependent on triton pkg. * 11/26/2024 [1.3.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.3.0) Zero-Day Hymba model support. Removed `tqdm` and `rogue` dependency. * 11/24/2024 [1.2.3](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.2.3) HF GLM model support. ClearML logging integration. Use `device-smi` and replace `gputil` + `psutil` depends. Fixed model unit tests. -* 11/11/2024 🚀 [1.2.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.2.1) Meta MobileLLM model support added. `lm-eval[gptqmodel]` integration merged upstream. Intel/IPEX cpu inference merged replacing QBits (deprecated). Auto-fix/patch ChatGLM-3/GLM-4 compat with latest transformers. New `.load()` and `.save()` api. -* 10/29/2024 🚀 [1.1.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.1.0) IBM Granite model support. Full auto-buildless wheel install from pypi. Reduce max cpu memory usage by >20% during quantization. 100% CI model/feature coverage.
Archived News: +* 11/11/2024 🚀 [1.2.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.2.1) Meta MobileLLM model support added. `lm-eval[gptqmodel]` integration merged upstream. Intel/IPEX cpu inference merged replacing QBits (deprecated). Auto-fix/patch ChatGLM-3/GLM-4 compat with latest transformers. New `.load()` and `.save()` api. + +* 10/29/2024 🚀 [1.1.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.1.0) IBM Granite model support. Full auto-buildless wheel install from pypi. Reduce max cpu memory usage by >20% during quantization. 100% CI model/feature coverage. + * 10/12/2024 ✨ [1.0.9](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.0.9) Move AutoRound to optional and fix pip install regression in v1.0.8. * 10/11/2024 ✨ [1.0.8](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.0.8) Add wheel for python 3.12 and cuda 11.8. @@ -61,6 +64,7 @@ Public tests/papers and ModelCloud's internal tests have shown that GPTQ is on-p ## Features * 🚀 Extensive model support for: `Llama 1-3.3`, `Qwen2-VL`, `Olmo2`, `Hymba`, `GLM`, `IBM Granite`, `Llama 3.2 Vision`, `MiniCPM3`, `GRIN-Moe`, `Phi 1-4`, `EXAONE 3.0`, `InternLM 2.5`, `Gemma 2`, `DeepSeek-V2`, `DeepSeek-V2-Lite`, `ChatGLM`, `MiniCPM`, `Qwen2MoE`, `DBRX`. +* ✨ Linux, MacOS, Windows platform quantization and accelerated inference support. * 💯 100% CI unit-test coverage for all supported models and kernels including post-quantization quality regression. * ✨ `Dynamic`/Mixed quantization control on a per-module basis. Each layer/module can have a unique quantization config or be excluded from quantization all together. * 🚀 [vLLM](https://github.com/vllm-project/vllm) and [SGLang](https://github.com/sgl-project/sglang) inference integration for quantized model where format = `FORMAT.GPTQ` @@ -79,7 +83,9 @@ Public tests/papers and ModelCloud's internal tests have shown that GPTQ is on-p ## Quality: GPTQModel 4bit can match BF16: 🤗 [ModelCloud quantized ultra-high recovery vortex-series models on HF](https://huggingface.co/collections/ModelCloud/vortex-673743382af0a52b2a8b9fe2) -![image](https://github.com/user-attachments/assets/aab69119-f9c8-4c94-9634-a3c63e57095e) +![image](https://github.com/user-attachments/assets/7b2db012-b8af-4d19-a25d-7023cef19220) + + ## Model Support: 🚀 (Added by GPTQModel) | Model | | | | | | | | | @@ -96,17 +102,17 @@ Public tests/papers and ModelCloud's internal tests have shown that GPTQ is on-p | EXAONE 3.0 | 🚀 | InternLM 1/2.5 | 🚀 | OPT | ✅ | Yi | ✅ | | -## Kernel and HW Accelerator Support +## Platform and HW Support -GPTQModel is validated for Linux x86_64 with the following devices: +GPTQModel is validated for Linux, MacOS, and Windows 11: | Platform | Device | | Optimized Arch | Kernels | |-----------------|---------------| --- | -------------- | -------------- | | Linux | Nvidia GPU | ✅ | Ampere or Higher | Marlin, Exllama V2, Exallma V1, Triton, DyanamicCuda, Torch | | Linux | Intel/AMD CPU | ✅ | `avx512` or `amx` | IPEX, Torch | | Linux | Intel XPU | ✅ | Intel Arc + Datacenter Max | IPEX, Torch | -| MacOS | GPU (Metal) and CPU | ✅ | M1+ | Torch | - +| MacOS | GPU (Metal) / CPU | ✅ | M1+ | Torch | +| Windows 11 | GPU (Nvidia) / CPU | ✅ | Nvidia | DynamicCuda, Torch | ## Install diff --git a/examples/benchmark/generation_speed.py b/examples/benchmark/generation_speed.py index 5bfc61db2..10c957896 100644 --- a/examples/benchmark/generation_speed.py +++ b/examples/benchmark/generation_speed.py @@ -7,11 +7,13 @@ import torch from datasets import Dataset, load_dataset -from 
gptqmodel import BACKEND, GPTQModel, QuantizeConfig -from gptqmodel.utils.progress import ProgressBar from transformers import AutoTokenizer, GenerationConfig from transformers.generation.logits_process import LogitsProcessor +from gptqmodel import BACKEND, GPTQModel, QuantizeConfig +from gptqmodel.utils.progress import ProgressBar + + logger = logging.getLogger(__name__) random.seed(0) diff --git a/examples/benchmark/ipex.py b/examples/benchmark/ipex.py index 1fed35bef..753858d52 100644 --- a/examples/benchmark/ipex.py +++ b/examples/benchmark/ipex.py @@ -4,6 +4,7 @@ import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + try: from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf bind_cores_for_best_perf() @@ -13,6 +14,7 @@ import argparse + parser = argparse.ArgumentParser(description="Benchmark IPEX vs HF on a pre-trained model.") parser.add_argument("--model", type=str, required=True, help="Path or name of the pre-trained model.") parser.add_argument("--cores", type=int, default=8, help="Number of CPU cores to use.") diff --git a/examples/benchmark/perplexity.py b/examples/benchmark/perplexity.py index 8d6c21d36..ca045ce98 100644 --- a/examples/benchmark/perplexity.py +++ b/examples/benchmark/perplexity.py @@ -2,9 +2,11 @@ import os import torch -from gptqmodel.utils import Perplexity from transformers import AutoTokenizer +from gptqmodel.utils import Perplexity + + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" if __name__ == "__main__": @@ -51,7 +53,7 @@ tokenizer.pad_token_id = tokenizer.eos_token_id if args.is_quantized: - from gptqmodel import GPTQModel, BACKEND + from gptqmodel import BACKEND, GPTQModel model = GPTQModel.load( args.model_name, diff --git a/examples/evaluation/run_language_modeling_task.py b/examples/evaluation/run_language_modeling_task.py index fb33e1d94..ed384215d 100644 --- a/examples/evaluation/run_language_modeling_task.py +++ b/examples/evaluation/run_language_modeling_task.py @@ -2,12 +2,13 @@ import datasets import torch -from gptqmodel import GPTQModel, QuantizeConfig, BACKEND -from gptqmodel.eval_tasks import LanguageModelingTask from transformers import AutoTokenizer +from gptqmodel import BACKEND, GPTQModel, QuantizeConfig +from gptqmodel.eval_tasks import LanguageModelingTask from gptqmodel.utils.torch import torch_empty_cache + DATASET = "tatsu-lab/alpaca" WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n" WITHOUT_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nOutput:\n" diff --git a/examples/evaluation/run_sequence_classification_task.py b/examples/evaluation/run_sequence_classification_task.py index 489914fa0..f3344c858 100644 --- a/examples/evaluation/run_sequence_classification_task.py +++ b/examples/evaluation/run_sequence_classification_task.py @@ -3,12 +3,13 @@ import datasets import torch -from gptqmodel import GPTQModel, QuantizeConfig, BACKEND -from gptqmodel.eval_tasks import SequenceClassificationTask from transformers import AutoTokenizer +from gptqmodel import BACKEND, GPTQModel, QuantizeConfig +from gptqmodel.eval_tasks import SequenceClassificationTask from gptqmodel.utils.torch import torch_empty_cache + DATASET = "cardiffnlp/tweet_sentiment_multilingual" TEMPLATE = "Question:What's the sentiment of the given text? 
Choices are {labels}.\nText: {text}\nAnswer:" ID2LABEL = {0: "negative", 1: "neutral", 2: "positive"} diff --git a/examples/evaluation/run_text_summarization_task.py b/examples/evaluation/run_text_summarization_task.py index ae44fe7ec..2357baebe 100644 --- a/examples/evaluation/run_text_summarization_task.py +++ b/examples/evaluation/run_text_summarization_task.py @@ -3,12 +3,13 @@ import datasets import torch -from gptqmodel import GPTQModel, QuantizeConfig, BACKEND -from gptqmodel.eval_tasks import TextSummarizationTask from transformers import AutoTokenizer, GenerationConfig +from gptqmodel import BACKEND, GPTQModel, QuantizeConfig +from gptqmodel.eval_tasks import TextSummarizationTask from gptqmodel.utils.torch import torch_empty_cache + os.system("pip install py7zr") diff --git a/examples/inference/run_transformers.py b/examples/inference/run_transformers.py index 348515d3a..077dc25ea 100644 --- a/examples/inference/run_transformers.py +++ b/examples/inference/run_transformers.py @@ -1,5 +1,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ") quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ") print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0])) diff --git a/examples/inference/run_with_different_backends.py b/examples/inference/run_with_different_backends.py index 428a8a04f..5b018c036 100644 --- a/examples/inference/run_with_different_backends.py +++ b/examples/inference/run_with_different_backends.py @@ -3,9 +3,11 @@ import sys from argparse import ArgumentParser -from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device from transformers import AutoTokenizer +from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device + + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" pretrained_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "./TinyLlama/TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/basic_usage.py b/examples/quantization/basic_usage.py index d2aba4e3b..1fb6ce61d 100644 --- a/examples/quantization/basic_usage.py +++ b/examples/quantization/basic_usage.py @@ -1,8 +1,10 @@ import os -from gptqmodel import GPTQModel, QuantizeConfig, get_best_device from transformers import AutoTokenizer +from gptqmodel import GPTQModel, QuantizeConfig, get_best_device + + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" diff --git a/examples/quantization/basic_usage_autoround.py b/examples/quantization/basic_usage_autoround.py index 4b0e2e0e6..ecf1ca363 100644 --- a/examples/quantization/basic_usage_autoround.py +++ b/examples/quantization/basic_usage_autoround.py @@ -1,7 +1,9 @@ import torch +from transformers import AutoTokenizer + from gptqmodel import GPTQModel from gptqmodel.quantization.config import AutoRoundQuantizeConfig # noqa: E402 -from transformers import AutoTokenizer + pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "./autoround/TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/basic_usage_wikitext2.py b/examples/quantization/basic_usage_wikitext2.py index c93af66a2..1c07aa6ed 100644 --- a/examples/quantization/basic_usage_wikitext2.py +++ b/examples/quantization/basic_usage_wikitext2.py @@ 
-1,8 +1,10 @@ import torch from datasets import load_dataset -from gptqmodel import GPTQModel, QuantizeConfig from transformers import AutoTokenizer +from gptqmodel import GPTQModel, QuantizeConfig + + pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0" quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g" diff --git a/examples/quantization/transformers_usage.py b/examples/quantization/transformers_usage.py index d6e279d29..03f4b5100 100755 --- a/examples/quantization/transformers_usage.py +++ b/examples/quantization/transformers_usage.py @@ -1,5 +1,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig + model_id = "facebook/opt-125m" tokenizer = AutoTokenizer.from_pretrained(model_id) dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."] diff --git a/gptqmodel/__init__.py b/gptqmodel/__init__.py index fa771cce9..277c43d11 100644 --- a/gptqmodel/__init__.py +++ b/gptqmodel/__init__.py @@ -1,5 +1,5 @@ from .models import GPTQModel, get_best_device -from .utils import BACKEND from .quantization import BaseQuantizeConfig, QuantizeConfig +from .utils import BACKEND from .utils.exllama import exllama_set_max_input_length from .version import __version__ diff --git a/gptqmodel/integration/src/optimum/gptq/quantizer.py b/gptqmodel/integration/src/optimum/gptq/quantizer.py index f87d99d7d..4706b38f3 100644 --- a/gptqmodel/integration/src/optimum/gptq/quantizer.py +++ b/gptqmodel/integration/src/optimum/gptq/quantizer.py @@ -625,7 +625,7 @@ def tmp(_, input, output): h.remove() for name in subset_name_list: logger.info(f"Quantizing {name} in block {i + 1}/{len(blocks)}...") - quant_outputs = gptq[name].hf_quantize( + quant_outputs = gptq[name].fasterquant( percdamp=self.damp_percent, group_size=self.group_size, actorder=self.desc_act ) scale, zero, g_idx = quant_outputs[0], quant_outputs[1], quant_outputs[2] diff --git a/gptqmodel/integration/src/transformers/quantizers/quantizer_gptq.py b/gptqmodel/integration/src/transformers/quantizers/quantizer_gptq.py index f9b3416c5..e6f0d6d15 100644 --- a/gptqmodel/integration/src/transformers/quantizers/quantizer_gptq.py +++ b/gptqmodel/integration/src/transformers/quantizers/quantizer_gptq.py @@ -72,7 +72,7 @@ def validate_environment(self, *args, **kwargs): ) elif is_gptqmodel_available() and ( version.parse(importlib.metadata.version("gptqmodel")) < version.parse("1.4.3") - or version.parse(importlib.metadata.version("optimum")) < version.parse("1.23.99") + or version.parse(importlib.metadata.version("optimum")) < version.parse("1.23.3") ): raise ImportError("The gptqmodel version should be >= 1.4.3, optimum version should >= 1.24.0") diff --git a/gptqmodel/models/_const.py b/gptqmodel/models/_const.py index 97bf0bc69..b8b7bd368 100644 --- a/gptqmodel/models/_const.py +++ b/gptqmodel/models/_const.py @@ -1,11 +1,11 @@ -import sys from enum import Enum import torch from torch import device from ..utils import BACKEND -from ..utils.torch import HAS_XPU, HAS_MPS, HAS_CUDA +from ..utils.torch import HAS_CUDA, HAS_MPS, HAS_XPU + CPU = device("cpu") CUDA = device("cuda") @@ -15,11 +15,18 @@ MPS = device("mps") class DEVICE(str, Enum): + ALL = "all" # All device CPU = "cpu" # All CPU CUDA = "cuda" # Nvidia GPU XPU = "xpu" # Intel GPU MPS = "mps" # MacOS GPU +class PLATFORM(str, Enum): + ALL = "all" # All platform + LINUX = "linux" # linux + WIN32 = "win32" # windows + DARWIN = "darwin" # macos + def 
validate_cuda_support(raise_exception: bool = False): got_cuda = HAS_CUDA diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py index 64149561f..6cbd9eed1 100644 --- a/gptqmodel/models/auto.py +++ b/gptqmodel/models/auto.py @@ -1,7 +1,9 @@ from __future__ import annotations + import os import sys + # TODO: waiting for pytorch implementgation of aten ops for MPS if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" @@ -68,6 +70,7 @@ from .definitions.yi import YiGPTQ from .definitions.ovis import OvisGPTQ + logger = setup_logger() MODEL_MAP = { @@ -124,7 +127,6 @@ HAS_IPEX = False try: - from intel_extension_for_pytorch.llm.quantization import IPEXWeightOnlyQuantizedLinear HAS_IPEX = True except Exception: pass @@ -276,10 +278,11 @@ def eval( if task not in EVAL.get_task_enums(): raise ValueError(f"lm_eval support tasks: {EVAL.get_all_tasks_string()}") - from gptqmodel.utils.eval import lm_eval from lm_eval.utils import make_table from transformers import AutoTokenizer + from gptqmodel.utils.eval import lm_eval + tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code) model_name = 'hf' if backend == 'gptqmodel' else backend diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 257a8d7d8..a89701de5 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -19,14 +19,23 @@ from ..utils.device import get_cpu_usage_memory, get_gpu_usage_memory from ..utils.importer import select_quant_linear from ..utils.logger import setup_logger -from ..utils.model import (check_to_quantized, find_layers, get_device, get_module_by_name_prefix, - get_module_by_name_suffix, get_moe_layer_modules, move_to, - nested_move_to, pack_model, simple_dispatch_model) +from ..utils.model import ( + check_to_quantized, + find_layers, + get_device, + get_module_by_name_prefix, + get_module_by_name_suffix, + get_moe_layer_modules, + move_to, + nested_move_to, + pack_model, + simple_dispatch_model, +) from ..utils.progress import ProgressBar -from ._const import CPU, get_best_device, DEVICE +from ..utils.torch import torch_empty_cache +from ._const import CPU, DEVICE, get_best_device from .loader import ModelLoader from .writer import QUANT_LOG_DAMP, QUANT_LOG_LAYER, QUANT_LOG_LOSS, QUANT_LOG_MODULE, QUANT_LOG_TIME, ModelWriter -from ..utils.torch import torch_empty_cache def check_support_param_buffer_assignment(*args, **kwargs): diff --git a/gptqmodel/models/definitions/gemma2.py b/gptqmodel/models/definitions/gemma2.py index 15cbdba01..2bde8126d 100644 --- a/gptqmodel/models/definitions/gemma2.py +++ b/gptqmodel/models/definitions/gemma2.py @@ -2,6 +2,7 @@ from ...utils.logger import setup_logger from ..base import BaseGPTQModel + logger = setup_logger() SUPPORT_ERR = "Currently, only vLLM/SGLang with flashinfer enabled can correctly inference a quantized Gemma2-27B model. Pre-quantized model with sample vLLM code: https://huggingface.co/ModelCloud/gemma-2-27b-it-gptq-4bit ." 
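Note: the `_const.py` hunk above adds a `PLATFORM` enum (and `DEVICE.ALL`), and the `nn_modules/qlinear` hunks below give every kernel `SUPPORTS_DEVICES`/`SUPPORTS_PLATFORM` whitelists that `BaseQuantLinear._validate` checks against `sys.platform`. A minimal, self-contained sketch of that gating pattern is shown here; `KernelStub` and the simplified `validate` signature are illustrative only, not the library's actual classes or API.

```python
import sys
from enum import Enum
from typing import List, Optional, Tuple


class DEVICE(str, Enum):
    ALL = "all"
    CPU = "cpu"
    CUDA = "cuda"
    XPU = "xpu"
    MPS = "mps"


class PLATFORM(str, Enum):
    ALL = "all"
    LINUX = "linux"
    WIN32 = "win32"
    DARWIN = "darwin"


class KernelStub:
    # Each kernel declares what it can run on; DEVICE.ALL / PLATFORM.ALL opt out of the check.
    SUPPORTS_DEVICES: List[DEVICE] = [DEVICE.CUDA]
    SUPPORTS_PLATFORM: List[PLATFORM] = [PLATFORM.LINUX, PLATFORM.WIN32]

    @classmethod
    def validate(cls, device: Optional[DEVICE] = None) -> Tuple[bool, Optional[Exception]]:
        # Reject early when the running OS is not in the kernel's platform whitelist.
        # PLATFORM is a str-mixin Enum, so sys.platform ("linux"/"win32"/"darwin")
        # compares equal to its members directly.
        if PLATFORM.ALL not in cls.SUPPORTS_PLATFORM and sys.platform not in cls.SUPPORTS_PLATFORM:
            return False, NotImplementedError(f"{cls.__name__} does not support platform: {sys.platform}")
        # Only check the device when the kernel restricts devices and a device was requested.
        if DEVICE.ALL not in cls.SUPPORTS_DEVICES and device is not None and device not in cls.SUPPORTS_DEVICES:
            return False, NotImplementedError(f"{cls.__name__} does not support device: {device}")
        return True, None


# e.g. on macOS this returns (False, NotImplementedError(...)) because DARWIN is not whitelisted
ok, err = KernelStub.validate(device=DEVICE.CUDA)
```

In the patch itself, `TorchQuantLinear` declares `DEVICE.ALL`/`PLATFORM.ALL`, Triton/DynamicCuda/BitBLAS allow `LINUX` and `WIN32`, while Marlin/Exllama/ExllamaV2 stay `LINUX`-only, matching the `setup.py` change that skips building those extensions on `win32`.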
diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 53ebf6bae..f5aeb4d6e 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -16,15 +16,27 @@ from ..quantization import QuantizeConfig from ..quantization.config import FORMAT, FORMAT_FIELD_JSON, MIN_VERSION_WITH_V2 from ..utils.backend import BACKEND -from ..utils.importer import select_quant_linear, select_device +from ..utils.importer import select_device, select_quant_linear from ..utils.logger import setup_logger -from ..utils.marlin import (_validate_marlin_compatibility, - _validate_marlin_device_support, prepare_model_for_marlin_load) -from ..utils.model import (auto_dtype_from_config, convert_gptq_v1_to_v2_format, find_layers, - get_checkpoints, get_moe_layer_modules, gptqmodel_post_init, make_quant, - simple_dispatch_model, verify_model_hash, verify_sharded_model_hashes) +from ..utils.marlin import ( + _validate_marlin_compatibility, + _validate_marlin_device_support, + prepare_model_for_marlin_load, +) +from ..utils.model import ( + auto_dtype_from_config, + convert_gptq_v1_to_v2_format, + find_layers, + get_checkpoints, + get_moe_layer_modules, + gptqmodel_post_init, + make_quant, + simple_dispatch_model, + verify_model_hash, + verify_sharded_model_hashes, +) from ._const import DEVICE, SUPPORTED_MODELS, normalize_device -from ..utils.torch import HAS_CUDA, HAS_XPU, HAS_MPS + logger = setup_logger() diff --git a/gptqmodel/models/writer.py b/gptqmodel/models/writer.py index 2f2f5a0ef..50ae49ecd 100644 --- a/gptqmodel/models/writer.py +++ b/gptqmodel/models/writer.py @@ -18,17 +18,34 @@ from transformers.modeling_utils import no_init_weights from transformers.utils.generic import ContextManagers -from ..quantization.config import (FORMAT, META_FIELD_DAMP_AUTO_INCREMENT, META_FIELD_DAMP_PERCENT, META_FIELD_MSE, - META_FIELD_QUANTIZER, META_FIELD_STATIC_GROUPS, META_FIELD_TRUE_SEQUENTIAL, - META_FIELD_URI, META_QUANTIZER_GPTQMODEL, META_VALUE_URI, MIN_VERSION_WITH_V2) +from ..quantization.config import ( + FORMAT, + META_FIELD_DAMP_AUTO_INCREMENT, + META_FIELD_DAMP_PERCENT, + META_FIELD_MSE, + META_FIELD_QUANTIZER, + META_FIELD_STATIC_GROUPS, + META_FIELD_TRUE_SEQUENTIAL, + META_FIELD_URI, + META_QUANTIZER_GPTQMODEL, + META_VALUE_URI, + MIN_VERSION_WITH_V2, +) from ..utils.backend import BACKEND from ..utils.logger import setup_logger -from ..utils.model import (convert_gptq_v2_to_v1_format, copy_py_files, find_layers, - get_model_files_size, get_moe_layer_modules, make_quant) +from ..utils.model import ( + convert_gptq_v2_to_v1_format, + copy_py_files, + find_layers, + get_model_files_size, + get_moe_layer_modules, + make_quant, +) from ..utils.torch import torch_empty_cache from ..version import __version__ from ._const import CPU + logger = setup_logger() QUANT_LOG_LAYER = "layer" diff --git a/gptqmodel/nn_modules/qlinear/__init__.py b/gptqmodel/nn_modules/qlinear/__init__.py index 1190fc489..40886483f 100644 --- a/gptqmodel/nn_modules/qlinear/__init__.py +++ b/gptqmodel/nn_modules/qlinear/__init__.py @@ -1,8 +1,11 @@ +import sys from typing import List, Optional, Tuple, Union +import torch import torch.nn as nn -from ...models._const import DEVICE, normalize_device +from ...models._const import DEVICE, PLATFORM, normalize_device + class BaseQuantLinear(nn.Module): SUPPORTS_BITS: List[int] = None @@ -16,6 +19,7 @@ class BaseQuantLinear(nn.Module): SUPPORTS_OUT_FEATURES_DIVISIBLE_BY: List[int] = None SUPPORTS_DEVICES: List[DEVICE] = None + SUPPORTS_PLATFORM: List[PLATFORM] = None def 
__init__(self, bits: int, group_size: int, desc_act: bool, sym: bool, infeatures: int, outfeatures: int, *args, **kwargs): @@ -72,7 +76,11 @@ def _validate(cls, bits: int, group_size: int, desc_act: bool, sym: bool, dynami outfeatures:int=None, device:Optional[DEVICE]=None, trainable:Optional[bool]=None) -> Tuple[bool, Optional[Exception]]: cls.verify_supports_params() - if device is not None: + if PLATFORM.ALL not in cls.SUPPORTS_PLATFORM and sys.platform not in cls.SUPPORTS_PLATFORM: + err = f"{cls} does not support platform: {sys.platform}" + return False, NotImplementedError(err) + + if DEVICE.ALL not in cls.SUPPORTS_DEVICES and device is not None: try: cls.validate_device(device) except NotImplementedError: @@ -150,7 +158,7 @@ def _validate(cls, bits: int, group_size: int, desc_act: bool, sym: bool, dynami return True, None @classmethod - def validate_device(cls, device: DEVICE): + def validate_device(cls, device: str|DEVICE|int|torch.device): dev = normalize_device(device) if dev not in cls.SUPPORTS_DEVICES: diff --git a/gptqmodel/nn_modules/qlinear/bitblas.py b/gptqmodel/nn_modules/qlinear/bitblas.py index e13763e82..00b89787a 100644 --- a/gptqmodel/nn_modules/qlinear/bitblas.py +++ b/gptqmodel/nn_modules/qlinear/bitblas.py @@ -9,11 +9,13 @@ import numpy as np import torch import torch.nn as nn + from gptqmodel.nn_modules.qlinear import BaseQuantLinear -from ...models._const import DEVICE +from ...models._const import DEVICE, PLATFORM from ...utils.logger import setup_logger + logger = setup_logger() BITBLAS_TARGET = None @@ -86,6 +88,7 @@ class BitBLASQuantLinear(BaseQuantLinear): SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [16] SUPPORTS_DEVICES = [DEVICE.CUDA] + SUPPORTS_PLATFORM = [PLATFORM.LINUX, PLATFORM.WIN32] OPT_FEATURES = [1, 16, 32, 64, 128, 256, 512] zeros_mode = "quantized" # "original" or "rescale" or "quantized" @@ -136,12 +139,10 @@ def __init__( self.reset_parameters() @classmethod - def validate(cls, bits: int, group_size: int, desc_act: bool, sym: bool, infeatures:int=None, - outfeatures:int=None, dynamic:Optional[dict]=None, device:Optional[DEVICE]=None, trainable:Optional[bool]=None) -> Tuple[ - bool, Optional[Exception]]: + def validate(cls, **args) -> Tuple[bool, Optional[Exception]]: if not BITBLAS_AVAILABLE: return False, ValueError(BITBLAS_INSTALL_HINT) - return cls._validate(bits=bits, group_size=group_size, desc_act=desc_act, sym=sym, dynamic=dynamic, device=device, trainable=trainable) + return cls._validate(**args) def _validate_parameters( self, group_size: int, infeatures: int, outfeatures: int diff --git a/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py b/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py index b98dd29ab..5034e5c22 100644 --- a/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py +++ b/gptqmodel/nn_modules/qlinear/bitblas_target_detector.py @@ -8,6 +8,7 @@ from ...utils.logger import setup_logger + logger = setup_logger() TARGET_MISSING_ERROR = ( diff --git a/gptqmodel/nn_modules/qlinear/dynamic_cuda.py b/gptqmodel/nn_modules/qlinear/dynamic_cuda.py index 28a29ac04..dcbbe9606 100644 --- a/gptqmodel/nn_modules/qlinear/dynamic_cuda.py +++ b/gptqmodel/nn_modules/qlinear/dynamic_cuda.py @@ -1,10 +1,12 @@ # License: GPTQModel/licenses/LICENSE.apache import torch + from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear from gptqmodel.utils.logger import setup_logger -from ...models._const import DEVICE +from ...models._const import DEVICE, PLATFORM + logger = setup_logger() @@ -29,6 +31,7 @@ class 
DynamicCudaQuantLinear(TorchQuantLinear): SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [64] SUPPORTS_DEVICES = [DEVICE.CUDA] + SUPPORTS_PLATFORM = [PLATFORM.LINUX, PLATFORM.WIN32] # for transformers/optimum tests compat QUANT_TYPE = "cuda" diff --git a/gptqmodel/nn_modules/qlinear/exllama.py b/gptqmodel/nn_modules/qlinear/exllama.py index 569bd9bbd..85669e037 100644 --- a/gptqmodel/nn_modules/qlinear/exllama.py +++ b/gptqmodel/nn_modules/qlinear/exllama.py @@ -3,15 +3,18 @@ import math from logging import getLogger +from typing import Optional, Tuple import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import transformers + from gptqmodel.nn_modules.qlinear import BaseQuantLinear -from ...models._const import DEVICE +from ...models._const import DEVICE, PLATFORM + exllama_import_exception = None try: @@ -54,6 +57,7 @@ class ExllamaQuantLinear(BaseQuantLinear): SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [32] SUPPORTS_DEVICES = [DEVICE.CUDA] + SUPPORTS_PLATFORM = [PLATFORM.LINUX] # for transformers/optimum tests compat QUANT_TYPE = "exllama" @@ -111,6 +115,12 @@ def __init__(self, bits: int, group_size: int, desc_act: bool, sym: bool, infeat else: self.bias = None + @classmethod + def validate(cls, **args) -> Tuple[bool, Optional[Exception]]: + if exllama_import_exception is not None: + return False, exllama_import_exception + return cls._validate(**args) + def post_init(self): self.validate_device(self.qweight.device.type) assert self.qweight.device.index is not None diff --git a/gptqmodel/nn_modules/qlinear/exllamav2.py b/gptqmodel/nn_modules/qlinear/exllamav2.py index 6d977a868..7088c0279 100644 --- a/gptqmodel/nn_modules/qlinear/exllamav2.py +++ b/gptqmodel/nn_modules/qlinear/exllamav2.py @@ -2,14 +2,17 @@ # Adapted from turboderp exllama: https://github.com/turboderp/exllamav2 import math +from typing import Optional, Tuple import torch import torch.nn.functional as F + from gptqmodel.nn_modules.qlinear import BaseQuantLinear -from ...models._const import DEVICE +from ...models._const import DEVICE, PLATFORM from ...utils.logger import setup_logger + exllama_v2_import_exception = None try: from gptqmodel_exllamav2_kernels import gemm_half_q_half, make_q_matrix @@ -115,6 +118,7 @@ class ExllamaV2QuantLinear(BaseQuantLinear): SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [32] SUPPORTS_DEVICES = [DEVICE.CUDA] + SUPPORTS_PLATFORM = [PLATFORM.LINUX] # for transformers/optimum tests compat QUANT_TYPE = "exllamav2" @@ -178,6 +182,12 @@ def __init__(self, bits: int, group_size: int, desc_act: bool, sym: bool, infeat else: self.bias = None + @classmethod + def validate(cls, **args) -> Tuple[bool, Optional[Exception]]: + if exllama_v2_import_exception is not None: + return False, exllama_v2_import_exception + return cls._validate(**args) + def post_init(self, temp_dq): self.validate_device(self.qweight.device.type) assert self.qweight.device.index is not None diff --git a/gptqmodel/nn_modules/qlinear/ipex.py b/gptqmodel/nn_modules/qlinear/ipex.py index 37fec5fa8..0e1c27c87 100644 --- a/gptqmodel/nn_modules/qlinear/ipex.py +++ b/gptqmodel/nn_modules/qlinear/ipex.py @@ -8,12 +8,14 @@ import torch import torch.nn as nn import transformers -from gptqmodel.models._const import DEVICE + +from gptqmodel.models._const import DEVICE, PLATFORM from gptqmodel.nn_modules.qlinear import BaseQuantLinear from ...utils.logger import setup_logger from ...utils.torch import HAS_XPU + logger = setup_logger() BITS_DTYPE_MAPPING = { @@ -64,6 +66,7 @@ class IPEXQuantLinear(BaseQuantLinear): 
SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [1] SUPPORTS_DEVICES = [DEVICE.CPU, DEVICE.XPU] + SUPPORTS_PLATFORM = [PLATFORM.LINUX] # for transformers/optimum tests compat QUANT_TYPE = "ipex" @@ -133,23 +136,21 @@ def __init__( self.wf = torch.tensor(list(range(0, 32, self.bits)), dtype=torch.int32).unsqueeze(0) @classmethod - def validate(cls, bits: int, group_size: int, desc_act: bool, sym: bool, infeatures:int=None, - outfeatures:int=None, dynamic:Optional[dict]=None, device:Optional[DEVICE]=None, trainable:Optional[bool]=None) -> Tuple[ - bool, Optional[Exception]]: + def validate(cls, **args) -> Tuple[bool, Optional[Exception]]: if sys.platform != "linux": return False, Exception("IPEX is only available on Linux platform.") if not HAS_IPEX: return False, IPEX_ERROR_LOG - return cls._validate(bits=bits, group_size=group_size, desc_act=desc_act, sym=sym, dynamic=dynamic, device=device, trainable=trainable) + return cls._validate(**args) def post_init(self): self.validate_device(self.qweight.device.type) def init_ipex_linear(self, x: torch.Tensor): if not self.training and HAS_IPEX and not x.requires_grad: - self.ipex_linear = IPEXWeightOnlyQuantizedLinear.from_weight(self.qweight, self.scales, self.qzeros, \ - self.infeatures, self.outfeatures, None, self.bias, \ + self.ipex_linear = IPEXWeightOnlyQuantizedLinear.from_weight(self.qweight, self.scales, self.qzeros, + self.infeatures, self.outfeatures, None, self.bias, self.group_size, self.g_idx, quant_method=0, dtype=0) def pack(self, linear, scales, zeros, g_idx=None): diff --git a/gptqmodel/nn_modules/qlinear/marlin.py b/gptqmodel/nn_modules/qlinear/marlin.py index 9141d3afb..37768edd6 100644 --- a/gptqmodel/nn_modules/qlinear/marlin.py +++ b/gptqmodel/nn_modules/qlinear/marlin.py @@ -5,10 +5,12 @@ import numpy as np import torch -from gptqmodel.nn_modules.qlinear import BaseQuantLinear from torch.nn.parameter import Parameter -from ...models._const import DEVICE +from gptqmodel.nn_modules.qlinear import BaseQuantLinear + +from ...models._const import DEVICE, PLATFORM + marlin_import_exception = None try: @@ -150,6 +152,7 @@ class MarlinQuantLinear(BaseQuantLinear): SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [64] SUPPORTS_DEVICES = [DEVICE.CUDA] + SUPPORTS_PLATFORM = [PLATFORM.LINUX] # for transformers/optimum tests compat QUANT_TYPE = "marlin" @@ -283,6 +286,12 @@ def __init__(self, bits: int, group_size: int, desc_act: bool, sym: bool, infeat else: self.bias = None + @classmethod + def validate(cls, **args) -> Tuple[bool, Optional[Exception]]: + if marlin_import_exception is not None: + return False, marlin_import_exception + return cls._validate(**args) + def post_init(self): device = self.qweight.device self.validate_device(device.type) diff --git a/gptqmodel/nn_modules/qlinear/torch.py b/gptqmodel/nn_modules/qlinear/torch.py index 47ef422c8..4fde4b552 100644 --- a/gptqmodel/nn_modules/qlinear/torch.py +++ b/gptqmodel/nn_modules/qlinear/torch.py @@ -7,10 +7,12 @@ import torch.nn as nn import torch.nn.functional as F import transformers + from gptqmodel.nn_modules.qlinear import BaseQuantLinear from gptqmodel.utils.logger import setup_logger -from ...models._const import DEVICE +from ...models._const import DEVICE, PLATFORM + logger = setup_logger() @@ -25,7 +27,8 @@ class TorchQuantLinear(BaseQuantLinear): SUPPORTS_IN_FEATURES_DIVISIBLE_BY = [1] SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [1] - SUPPORTS_DEVICES = [DEVICE.CPU, DEVICE.XPU, DEVICE.CUDA, DEVICE.MPS] + SUPPORTS_DEVICES = [DEVICE.ALL] + SUPPORTS_PLATFORM = [PLATFORM.ALL] # for 
transformers/optimum tests compat QUANT_TYPE = "torch" diff --git a/gptqmodel/nn_modules/qlinear/tritonv2.py b/gptqmodel/nn_modules/qlinear/tritonv2.py index 6e617eb59..84cdacbef 100644 --- a/gptqmodel/nn_modules/qlinear/tritonv2.py +++ b/gptqmodel/nn_modules/qlinear/tritonv2.py @@ -10,11 +10,12 @@ import transformers from packaging import version -from ...models._const import DEVICE +from ...models._const import DEVICE, PLATFORM from ...utils.logger import setup_logger from ..triton_utils.mixin import TritonModuleMixin from . import BaseQuantLinear + try: from triton import __version__ as triton_version @@ -42,6 +43,7 @@ class TritonV2QuantLinear(BaseQuantLinear, TritonModuleMixin): SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [32] SUPPORTS_DEVICES = [DEVICE.CUDA] + SUPPORTS_PLATFORM = [PLATFORM.LINUX, PLATFORM.WIN32] # for transformers/optimum tests compat QUANT_TYPE = "tritonv2" @@ -98,12 +100,10 @@ def __init__(self, bits: int, group_size: int, desc_act: bool, sym: bool, infeat self.bias = None @classmethod - def validate(cls, bits: int, group_size: int, desc_act: bool, sym: bool, infeatures:int=None, - outfeatures:int=None, dynamic:Optional[dict]=None, device:Optional[DEVICE]=None, trainable:Optional[bool]=None) -> Tuple[ - bool, Optional[Exception]]: + def validate(cls, **args) -> Tuple[bool, Optional[Exception]]: if not TRITON_AVAILABLE: return False, ValueError(TRITON_INSTALL_HINT) - return cls._validate(bits=bits, group_size=group_size, desc_act=desc_act, sym=sym, dynamic=dynamic, device=device, trainable=trainable) + return cls._validate(**args) def post_init(self): self.validate_device(self.qweight.device.type) diff --git a/gptqmodel/nn_modules/triton_utils/custom_autotune.py b/gptqmodel/nn_modules/triton_utils/custom_autotune.py index fde5ca2cc..9356e8b13 100644 --- a/gptqmodel/nn_modules/triton_utils/custom_autotune.py +++ b/gptqmodel/nn_modules/triton_utils/custom_autotune.py @@ -5,6 +5,7 @@ import triton + # code based https://github.com/fpgaminer/GPTQ-triton """ Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100. diff --git a/gptqmodel/nn_modules/triton_utils/kernels.py b/gptqmodel/nn_modules/triton_utils/kernels.py index 40a45bee2..7150d34dd 100644 --- a/gptqmodel/nn_modules/triton_utils/kernels.py +++ b/gptqmodel/nn_modules/triton_utils/kernels.py @@ -6,6 +6,7 @@ from ...utils.logger import setup_logger from . 
import custom_autotune + logger = setup_logger() diff --git a/gptqmodel/quantization/__init__.py b/gptqmodel/quantization/__init__.py index d97184ac4..a9e03bbcf 100644 --- a/gptqmodel/quantization/__init__.py +++ b/gptqmodel/quantization/__init__.py @@ -1,4 +1,13 @@ -from .config import (FORMAT, FORMAT_FIELD_CODE, FORMAT_FIELD_COMPAT_MARLIN, FORMAT_FIELD_JSON, - QUANT_CONFIG_FILENAME, QUANT_METHOD, QUANT_METHOD_FIELD, BaseQuantizeConfig, QuantizeConfig) +from .config import ( + FORMAT, + FORMAT_FIELD_CODE, + FORMAT_FIELD_COMPAT_MARLIN, + FORMAT_FIELD_JSON, + QUANT_CONFIG_FILENAME, + QUANT_METHOD, + QUANT_METHOD_FIELD, + BaseQuantizeConfig, + QuantizeConfig, +) from .gptq import GPTQ from .quantizer import Quantizer, quantize diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py index 02c254a06..fc29cf35b 100644 --- a/gptqmodel/quantization/config.py +++ b/gptqmodel/quantization/config.py @@ -98,6 +98,21 @@ def dict_scale_dtype_to_str(d: Dict[str, Any]) -> None: if isinstance(value, dict): dict_scale_dtype_to_str(value) + +def dynamic_get(dynamic: Dict[str, Dict[str, Union[int, bool]]], layer_name: str, key: str = None, + default_value: Union[int, bool] = None) -> Union[Dict, int, bool]: + for pattern, pattern_dict in dynamic.items(): + if pattern.startswith("-:"): + if re.match(pattern.removeprefix("-:"), layer_name): + return False + elif re.match(pattern.removeprefix("+:"), layer_name): + if key is None: + return pattern_dict + else: + return pattern_dict.get(key, default_value) + return default_value + + @dataclass class QuantizeConfig(): bits: int = field(default=4, metadata={"choices": [2, 3, 4, 8]}) @@ -183,16 +198,7 @@ def meta_get(self, key: str) -> Any: return self.meta.get(key) def dynamic_get(self, layer_name: str, key: str = None, default_value: Union[int, bool] = None) -> Union[Dict, int, bool]: - for pattern, pattern_dict in self.dynamic.items(): - if pattern.startswith("-:"): - if re.match(pattern.removeprefix("-:"), layer_name): - return False - elif re.match(pattern.removeprefix("+:"), layer_name): - if key is None: - return pattern_dict - else: - return pattern_dict.get(key, default_value) - return default_value + return dynamic_get(self.dynamic, layer_name, key, default_value) # versionable is a meta.property that pairs value with version i.e "value:1.0.0" def meta_set_versionable(self, key: str, value: List[str]): diff --git a/gptqmodel/quantization/gptq.py b/gptqmodel/quantization/gptq.py index c04b445a2..647f02694 100644 --- a/gptqmodel/quantization/gptq.py +++ b/gptqmodel/quantization/gptq.py @@ -11,11 +11,13 @@ import transformers from ..utils.logger import setup_logger +from ..utils.torch import torch_empty_cache, torch_sync from .quantizer import Quantizer -from ..utils.torch import torch_sync, torch_empty_cache + logger = setup_logger() +# TODO do we really need max precision? 
torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False @@ -23,20 +25,29 @@ class GPTQ: def __init__(self, layer): self.layer = layer - self.dev = self.layer.weight.device - W = layer.weight.data.clone() + self.device = self.layer.weight.device + + self.layer_copy = self._clone_layer() + + self.rows, self.columns = self.layer_copy.shape[0], self.layer_copy.shape[1] + self.H = torch.zeros((self.columns, self.columns), device=self.device) + self.nsamples = 0 + self.quantizer = Quantizer() + + def _clone_layer(self): + # mps for m1+ is unified memory + if self.device.type not in ["mps", "cpu"]: + clone = self.layer.weight.data.cpu() + else: + clone = self.layer.weight.data.clone() if isinstance(self.layer, nn.Conv2d): - W = W.flatten(1) + clone = clone.flatten(1) if isinstance(self.layer, transformers.pytorch_utils.Conv1D): - W = W.t() + clone = clone.t() - self.rows = W.shape[0] - self.columns = W.shape[1] - self.H = torch.zeros((self.columns, self.columns), device=self.dev) - self.nsamples = 0 - self.quantizer = Quantizer() + return clone.to(device=self.device, dtype=torch.float) def add_batch(self, inp, out): if os.environ.get("DEBUG"): @@ -70,14 +81,28 @@ def add_batch(self, inp, out): # self.H += 2 / self.nsamples * inp.matmul(inp.t()) self.H += inp.matmul(inp.t()) + # wrapper for backward compat with optimum + # TODO: mark for deprecation + def fasterquant( + self, + blocksize=128, + percdamp=0.01, + damp_auto_increment=0.0015, + group_size=-1, + actorder=False, + static_groups=False, + ): + return self.hf_quantize(blocksize, percdamp, damp_auto_increment, group_size, actorder, static_groups) + + # public api exposed to hf def hf_quantize( - self, - blocksize=128, - percdamp=0.01, - damp_auto_increment=0.0015, - group_size=-1, - actorder=False, - static_groups=False, + self, + blocksize=128, + percdamp=0.01, + damp_auto_increment=0.0015, + group_size=-1, + actorder=False, + static_groups=False, ): return self.quantize(blocksize, percdamp, damp_auto_increment, group_size, actorder, static_groups) @@ -91,24 +116,16 @@ def quantize( actorder=False, static_groups=False, ): + start = time.time() # TODO: waiting for pytorch implementation of ops for MPS if sys.platform == "darwin" and os.getenv("PYTORCH_ENABLE_MPS_FALLBACK") != "1": raise RuntimeError("For MacOS you must set env `PYTORCH_ENABLE_MPS_FALLBACK=1` before running quantization.") - # save mem and temp move to cpu - self.layer.weight.data = self.layer.weight.data.cpu() - - W = self.layer.weight.data.clone() - - if isinstance(self.layer, nn.Conv2d): - W = W.flatten(1) - - if isinstance(self.layer, transformers.Conv1D): - W = W.t() - - W = W.to(device=self.dev, dtype=torch.float) - - tick = time.time() + if self.layer_copy is None: + W = self._clone_layer() + else: + W = self.layer_copy + self.layer_copy = None if not self.quantizer.ready(): self.quantizer.find_params(W, weight=True) @@ -119,7 +136,7 @@ def quantize( H[dead, dead] = 1 W[:, dead] = 0 - g_idx = [] + # g_idx = [] scale = [] zero = [] now_idx = 1 @@ -148,7 +165,7 @@ def quantize( while 1 > percdamp > 0: try: damp = percdamp * torch.mean(torch.diag(H)) - diag = torch.arange(self.columns, device=self.dev) + diag = torch.arange(self.columns, device=self.device) H[diag, diag] += damp H = torch.linalg.cholesky(H) @@ -217,9 +234,8 @@ def quantize( logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) logger.debug(torch.sum(Losses)) - torch_sync(self.dev) + torch_sync(self.device) - duration = time.time() - tick avg_loss = 
torch.sum(Losses).item() / self.nsamples if math.isnan(avg_loss): @@ -248,7 +264,7 @@ def quantize( self.layer.weight.data = Q.cpu().type_as(self.layer.weight.data) # move back to self.dev - self.layer.weight.data = self.layer.weight.data.to(device=self.dev) + self.layer.weight.data = self.layer.weight.data.to(device=self.device) if os.environ.get("DEBUG"): logger.debug(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) @@ -260,6 +276,7 @@ def quantize( scale = torch.cat(scale, dim=1) zero = torch.cat(zero, dim=1) + duration = time.time() - start return scale, zero, g_idx, duration, avg_loss, percdamp def free(self): @@ -271,7 +288,10 @@ def free(self): self.Losses = None self.Trace = None - torch_empty_cache(self.dev) + self.quantizer = None + self.layer_copy = None + + torch_empty_cache(self.device) __all__ = ["GPTQ"] diff --git a/gptqmodel/quantization/quantizer.py b/gptqmodel/quantization/quantizer.py index f10ceacef..971777f2a 100644 --- a/gptqmodel/quantization/quantizer.py +++ b/gptqmodel/quantization/quantizer.py @@ -6,6 +6,7 @@ from ..utils.logger import setup_logger + logger = setup_logger() diff --git a/gptqmodel/utils/bitblas.py b/gptqmodel/utils/bitblas.py index be9bb9c43..76a6a63f8 100644 --- a/gptqmodel/utils/bitblas.py +++ b/gptqmodel/utils/bitblas.py @@ -1,4 +1,3 @@ -import gc import os import accelerate @@ -6,12 +5,13 @@ import torch from accelerate.utils import find_tied_parameters -from .torch import torch_empty_cache from ..nn_modules.qlinear.bitblas import BitBLASQuantLinear from ..quantization import FORMAT, QuantizeConfig from ..utils.logger import setup_logger from .model import recurse_getattr, recurse_setattr from .progress import ProgressBar +from .torch import torch_empty_cache + logger = setup_logger() diff --git a/gptqmodel/utils/device.py b/gptqmodel/utils/device.py index dff9d5cac..4ba3e10c5 100644 --- a/gptqmodel/utils/device.py +++ b/gptqmodel/utils/device.py @@ -1,5 +1,6 @@ from device_smi import Device + from gptqmodel.models._const import CPU, CUDA_0 diff --git a/gptqmodel/utils/exllama.py b/gptqmodel/utils/exllama.py index c7d717e80..68a65e49f 100644 --- a/gptqmodel/utils/exllama.py +++ b/gptqmodel/utils/exllama.py @@ -1,9 +1,8 @@ -import gc import torch -from .torch import torch_empty_cache from ..nn_modules.qlinear.exllama import ExllamaQuantLinear +from .torch import torch_empty_cache def exllama_set_max_input_length(model, max_input_length: int): diff --git a/gptqmodel/utils/importer.py b/gptqmodel/utils/importer.py index 8c29bcfa2..093d1a14a 100644 --- a/gptqmodel/utils/importer.py +++ b/gptqmodel/utils/importer.py @@ -1,10 +1,6 @@ from collections import OrderedDict -from typing import Dict, Optional, Type, Union, Tuple +from typing import Dict, Optional, Tuple, Type, Union -import torch - -from . import BACKEND -from .torch import HAS_XPU, HAS_CUDA, HAS_MPS from ..models._const import DEVICE, normalize_device from ..nn_modules.qlinear import BaseQuantLinear from ..nn_modules.qlinear.bitblas import BitBLASQuantLinear @@ -17,6 +13,9 @@ from ..nn_modules.qlinear.tritonv2 import TRITON_AVAILABLE, TRITON_INSTALL_HINT, TritonV2QuantLinear from ..quantization import FORMAT from ..utils.logger import setup_logger +from . 
import BACKEND +from .torch import HAS_CUDA, HAS_MPS, HAS_XPU + message_logged = False logger = setup_logger() diff --git a/gptqmodel/utils/logger.py b/gptqmodel/utils/logger.py index b62e21fcb..8a4fdbf2d 100644 --- a/gptqmodel/utils/logger.py +++ b/gptqmodel/utils/logger.py @@ -1,5 +1,6 @@ import logging + logger = None def setup_logger(): global logger diff --git a/gptqmodel/utils/marlin.py b/gptqmodel/utils/marlin.py index a2c1a16c0..b612458fa 100644 --- a/gptqmodel/utils/marlin.py +++ b/gptqmodel/utils/marlin.py @@ -1,15 +1,15 @@ -import gc import accelerate import torch from accelerate.utils import find_tied_parameters -from .torch import torch_empty_cache from ..nn_modules.qlinear.marlin import MarlinQuantLinear, _get_perms, unpack_qzeros from ..quantization import FORMAT, QuantizeConfig from ..utils.logger import setup_logger from .model import recurse_getattr, recurse_setattr from .progress import ProgressBar +from .torch import torch_empty_cache + logger = setup_logger() diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py index 16e86cdc8..59465e448 100644 --- a/gptqmodel/utils/model.py +++ b/gptqmodel/utils/model.py @@ -21,8 +21,7 @@ from transformers import AutoConfig, PretrainedConfig from transformers.utils.hub import cached_file -from .torch import torch_empty_cache -from ..models._const import CPU, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, EXPERT_INDEX_PLACEHOLDER, SUPPORTED_MODELS, DEVICE +from ..models._const import CPU, DEVICE, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, EXPERT_INDEX_PLACEHOLDER, SUPPORTED_MODELS from ..nn_modules.qlinear import BaseQuantLinear from ..nn_modules.qlinear.exllama import ExllamaQuantLinear from ..nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear @@ -33,6 +32,8 @@ from .importer import select_quant_linear from .logger import setup_logger from .progress import ProgressBar +from .torch import torch_empty_cache +from ..quantization.config import dynamic_get logger = setup_logger() @@ -190,6 +191,10 @@ def create_quant_layer(QuantLinear, bits, desc_act, dynamic, group_size, module, d_sym = sym # dynamic bits, group_size, sym for each layer/module if dynamic is not None: + if dynamic_get(dynamic=dynamic, layer_name=name) == False: # noqa: E712 + # skip create this quant linear + continue + for pattern, pattern_dict in dynamic.items(): if re.match(pattern, name): d_bits = pattern_dict.get("bits", bits) diff --git a/gptqmodel/utils/perplexity.py b/gptqmodel/utils/perplexity.py index 72f5bcd16..f2c8183bc 100644 --- a/gptqmodel/utils/perplexity.py +++ b/gptqmodel/utils/perplexity.py @@ -3,6 +3,7 @@ import numpy as np import torch from datasets import load_dataset + from gptqmodel.utils.progress import ProgressBar diff --git a/gptqmodel/utils/sglang.py b/gptqmodel/utils/sglang.py index b813bfbd0..2aba002a5 100644 --- a/gptqmodel/utils/sglang.py +++ b/gptqmodel/utils/sglang.py @@ -3,6 +3,7 @@ import torch from transformers import AutoConfig + try: import sglang as sgl SGLANG_AVAILABLE = True diff --git a/gptqmodel/utils/torch.py b/gptqmodel/utils/torch.py index 8194ee34f..38a4be37a 100644 --- a/gptqmodel/utils/torch.py +++ b/gptqmodel/utils/torch.py @@ -1,6 +1,8 @@ -import torch import gc as py_gc +import torch + + HAS_CUDA = False HAS_XPU = False HAS_MPS = False @@ -53,4 +55,4 @@ def torch_empty_cache(device: torch.device = None, gc: bool = True): elif device.type == "xpu": torch.xpu.empty_cache() elif device.type == "mps": - torch.mps.empty_cache() \ No newline at end of file + torch.mps.empty_cache() diff --git a/gptqmodel/utils/vllm.py 
b/gptqmodel/utils/vllm.py index 9d0d47d75..d9ff25e68 100644 --- a/gptqmodel/utils/vllm.py +++ b/gptqmodel/utils/vllm.py @@ -2,6 +2,7 @@ import torch + try: from vllm import LLM, SamplingParams diff --git a/gptqmodel/version.py b/gptqmodel/version.py index 19fec1e6f..81a610005 100644 --- a/gptqmodel/version.py +++ b/gptqmodel/version.py @@ -1 +1 @@ -__version__ = "1.4.4-dev" +__version__ = "1.4.5-dev" diff --git a/setup.py b/setup.py index 0a8ab3edd..7fd617cbc 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,7 @@ from setuptools import find_packages, setup from setuptools.command.bdist_wheel import bdist_wheel as _bdist_wheel + CUDA_RELEASE = os.environ.get("CUDA_RELEASE", None) TORCH_CUDA_ARCH_LIST = os.environ.get("TORCH_CUDA_ARCH_LIST") @@ -88,6 +89,7 @@ def get_version_tag(is_cuda_release: bool = True) -> str: import torch # noqa: E402 + if TORCH_CUDA_ARCH_LIST is None: got_cuda_v6 = any(torch.cuda.get_device_capability(i)[0] >= 6 for i in range(torch.cuda.device_count())) got_cuda_between_v6_and_v8 = any(6 <= torch.cuda.get_device_capability(i)[0] < 8 for i in range(torch.cuda.device_count())) @@ -174,40 +176,47 @@ def get_version_tag(is_cuda_release: bool = True) -> str: extra_link_args=extra_link_args, extra_compile_args=extra_compile_args, ), - cpp_ext.CUDAExtension( - "gptqmodel_marlin_kernels", - [ - "gptqmodel_ext/marlin/marlin_cuda.cpp", - "gptqmodel_ext/marlin/marlin_cuda_kernel.cu", - "gptqmodel_ext/marlin/marlin_repack.cu", - ], - extra_link_args=extra_link_args, - extra_compile_args=extra_compile_args, - ), - cpp_ext.CUDAExtension( - "gptqmodel_exllama_kernels", - [ - "gptqmodel_ext/exllama/exllama_ext.cpp", - "gptqmodel_ext/exllama/cuda_buffers.cu", - "gptqmodel_ext/exllama/cuda_func/column_remap.cu", - "gptqmodel_ext/exllama/cuda_func/q4_matmul.cu", - "gptqmodel_ext/exllama/cuda_func/q4_matrix.cu", - ], - extra_link_args=extra_link_args, - extra_compile_args=extra_compile_args, - ), - cpp_ext.CUDAExtension( - "gptqmodel_exllamav2_kernels", - [ - "gptqmodel_ext/exllamav2/ext.cpp", - "gptqmodel_ext/exllamav2/cuda/q_matrix.cu", - "gptqmodel_ext/exllamav2/cuda/q_gemm.cu", - ], - extra_link_args=extra_link_args, - extra_compile_args=extra_compile_args, - ) ] + if sys.platform != "win32": + extensions += [ + # TODO: VC++: fatal error C1061: compiler limit : blocks nested too deeply + cpp_ext.CUDAExtension( + "gptqmodel_marlin_kernels", + [ + "gptqmodel_ext/marlin/marlin_cuda.cpp", + "gptqmodel_ext/marlin/marlin_cuda_kernel.cu", + "gptqmodel_ext/marlin/marlin_repack.cu", + ], + extra_link_args=extra_link_args, + extra_compile_args=extra_compile_args, + ), + # TODO: VC++: error lnk2001 unresolved external symbol cublasHgemm + cpp_ext.CUDAExtension( + "gptqmodel_exllama_kernels", + [ + "gptqmodel_ext/exllama/exllama_ext.cpp", + "gptqmodel_ext/exllama/cuda_buffers.cu", + "gptqmodel_ext/exllama/cuda_func/column_remap.cu", + "gptqmodel_ext/exllama/cuda_func/q4_matmul.cu", + "gptqmodel_ext/exllama/cuda_func/q4_matrix.cu", + ], + extra_link_args=extra_link_args, + extra_compile_args=extra_compile_args, + ), + # TODO: VC++: error lnk2001 unresolved external symbol cublasHgemm + cpp_ext.CUDAExtension( + "gptqmodel_exllamav2_kernels", + [ + "gptqmodel_ext/exllamav2/ext.cpp", + "gptqmodel_ext/exllamav2/cuda/q_matrix.cu", + "gptqmodel_ext/exllamav2/cuda/q_gemm.cu", + ], + extra_link_args=extra_link_args, + extra_compile_args=extra_compile_args, + ) + ] + additional_setup_kwargs = {"ext_modules": extensions, "cmdclass": {"build_ext": cpp_ext.BuildExtension}} diff --git 
a/tests/models/model_test.py b/tests/models/model_test.py index 3a9ad0b01..7af9a7bcb 100644 --- a/tests/models/model_test.py +++ b/tests/models/model_test.py @@ -2,6 +2,7 @@ import os import sys + if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" @@ -13,15 +14,17 @@ import torch.cuda # noqa: E402 from datasets import load_dataset # noqa: E402 -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 from gptqmodel.quantization import FORMAT # noqa: E402 from gptqmodel.quantization.config import QuantizeConfig # noqa: E402 from gptqmodel.utils.eval import lm_eval # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 from ovis_calibration_dataset import get_calib_dataset +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 + RAND_SEED = 898 diff --git a/tests/models/test_opt.py b/tests/models/test_opt.py index e0c8a8ad2..ff782ac21 100644 --- a/tests/models/test_opt.py +++ b/tests/models/test_opt.py @@ -1,6 +1,7 @@ +from model_test import ModelTest + from gptqmodel import BACKEND from gptqmodel.utils.importer import backend_dict -from model_test import ModelTest class TestOpt(ModelTest): diff --git a/tests/test_asym_gptq_v1.py b/tests/test_asym_gptq_v1.py index f2592ec64..a42a1df9d 100644 --- a/tests/test_asym_gptq_v1.py +++ b/tests/test_asym_gptq_v1.py @@ -1,11 +1,13 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -from gptqmodel.quantization import FORMAT # noqa: E402 # -- end do not touch from models.model_test import ModelTest # noqa: E402 +from gptqmodel.quantization import FORMAT # noqa: E402 + class Test(ModelTest): NATIVE_MODEL_ID = "ModelCloud/Llama3.2-1B-Instruct" # "meta-llama/Llama-3.2-1B-Instruct" diff --git a/tests/test_dynamic.py b/tests/test_dynamic.py index 323f9cf61..795d7c52c 100644 --- a/tests/test_dynamic.py +++ b/tests/test_dynamic.py @@ -1,20 +1,22 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: E402 import unittest # noqa: E402 from datasets import load_dataset # noqa: E402 +from parameterized import parameterized # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.quantization import QuantizeConfig # noqa: E402 from gptqmodel.utils import Perplexity # noqa: E402 -from parameterized import parameterized # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 class TestDynamic(unittest.TestCase): @@ -112,3 +114,17 @@ def test_skip_module(self): for name, submodule in model.named_modules(): if name == 'model.model.layers.0.self_attn.q_proj' and isinstance(submodule, BaseQuantLinear): # module 0 was skipped raise ValueError("first layer should be native module") + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save(tmp_dir) + del model + + q_model = GPTQModel.load(tmp_dir) + generate_str = self.tokenizer.decode( + q_model.generate( + 
**self.tokenizer("The capital of France is is", return_tensors="pt").to(q_model.device), + max_new_tokens=2)[0]) + + print(f"generate_str: {generate_str}") + + self.assertIn("paris", generate_str.lower()) \ No newline at end of file diff --git a/tests/test_estimate_vram.py b/tests/test_estimate_vram.py index 2a1fe5387..edbee17ff 100644 --- a/tests/test_estimate_vram.py +++ b/tests/test_estimate_vram.py @@ -1,6 +1,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 diff --git a/tests/test_eval.py b/tests/test_eval.py index 62781ce51..2e53b6590 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -3,9 +3,11 @@ import unittest from typing import Union +from parameterized import parameterized + from gptqmodel import GPTQModel from gptqmodel.utils import EVAL -from parameterized import parameterized + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" diff --git a/tests/test_evalplus.py b/tests/test_evalplus.py index 4ceda12eb..949e042b8 100644 --- a/tests/test_evalplus.py +++ b/tests/test_evalplus.py @@ -1,10 +1,13 @@ +# -- do not touch import os -import tempfile -import unittest - -from gptqmodel.utils.eval import evalplus os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +# -- end do not touch + +import tempfile # noqa: E402 +import unittest # noqa: E402 + +from gptqmodel.utils.eval import evalplus # noqa: E402 class TestEvalplus(unittest.TestCase): diff --git a/tests/test_group_size.py b/tests/test_group_size.py index 311d54960..91713107b 100644 --- a/tests/test_group_size.py +++ b/tests/test_group_size.py @@ -1,6 +1,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import logging # noqa: E402 @@ -8,6 +9,9 @@ import traceback # noqa: E402 import unittest # noqa: E402 +from lm_eval.utils import make_table # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.dynamic_cuda import DynamicCudaQuantLinear # noqa: E402 @@ -18,8 +22,7 @@ from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 from gptqmodel.utils.eval import lm_eval # noqa: E402 -from lm_eval.utils import make_table # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 + logger = logging.getLogger(__name__) diff --git a/tests/test_ipex_xpu.py b/tests/test_ipex_xpu.py new file mode 100644 index 000000000..b509977f7 --- /dev/null +++ b/tests/test_ipex_xpu.py @@ -0,0 +1,41 @@ +# -- do not touch +import os + + +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +# -- end do not touch + +import tempfile # noqa: E402 + +from models.model_test import ModelTest # noqa: E402 + +from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 +from gptqmodel.models._const import DEVICE # noqa: E402 + + +class TestsIPEX(ModelTest): + NATIVE_MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct" + + def test(self): + origin_model = GPTQModel.load( + self.NATIVE_MODEL_ID, + quantize_config=QuantizeConfig(), + backend=BACKEND.IPEX, + device=DEVICE.XPU, + ) + tokenizer = self.load_tokenizer(self.NATIVE_MODEL_ID) + calibration_dataset = self.load_dataset(tokenizer) + origin_model.quantize(calibration_dataset) + with tempfile.TemporaryDirectory() as tmpdir: + origin_model.save(tmpdir) + + model = 
+            model = GPTQModel.load(
+                tmpdir,
+                backend=BACKEND.IPEX,
+                device=DEVICE.XPU,
+            )
+            generate_str = tokenizer.decode(model.generate(**tokenizer("The capital of France is", return_tensors="pt").to(model.device), max_new_tokens=2)[0])
+
+            print(f"generate_str: {generate_str}")
+
+            self.assertIn("paris", generate_str.lower())
diff --git a/tests/test_lm_eval.py b/tests/test_lm_eval.py
index afd3f2b99..bd33d7c6a 100644
--- a/tests/test_lm_eval.py
+++ b/tests/test_lm_eval.py
@@ -1,12 +1,15 @@
 # -- do not touch
 import os
+
 # -- end do not touch
 import tempfile  # noqa: E402
 import unittest  # noqa: E402
-from gptqmodel.utils.eval import lm_eval  # noqa: E402
 from lm_eval.utils import make_table
+from gptqmodel.utils.eval import lm_eval  # noqa: E402
+
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
diff --git a/tests/test_lm_head.py b/tests/test_lm_head.py
index 151ec33b8..8189aa48b 100644
--- a/tests/test_lm_head.py
+++ b/tests/test_lm_head.py
@@ -1,12 +1,14 @@
 # -- do not touch
 import os
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
+from models.model_test import ModelTest  # noqa: E402
+
 from gptqmodel import GPTQModel  # noqa: E402
 from gptqmodel.nn_modules.qlinear import BaseQuantLinear  # noqa: E402
-from models.model_test import ModelTest  # noqa: E402
 class TestLmHead(ModelTest):
diff --git a/tests/test_packing.py b/tests/test_packing.py
index a7e37d6c8..416f1d894 100644
--- a/tests/test_packing.py
+++ b/tests/test_packing.py
@@ -1,11 +1,13 @@
 # -- do not touch
 import os
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
 import unittest  # noqa: E402
+
 # isort: off
 import torch  # noqa: E402
 import torch.nn as nn  # noqa: E402
diff --git a/tests/test_perplexity.py b/tests/test_perplexity.py
index 762b83662..ab2efdc21 100644
--- a/tests/test_perplexity.py
+++ b/tests/test_perplexity.py
@@ -1,6 +1,7 @@
 # -- do not touch
 import os
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
@@ -8,11 +9,12 @@
 import unittest  # noqa: E402
 from datasets import load_dataset  # noqa: E402
+from parameterized import parameterized  # noqa: E402
+from transformers import AutoModelForCausalLM, AutoTokenizer  # noqa: E402
+
 from gptqmodel import GPTQModel  # noqa: E402
 from gptqmodel.quantization.config import FORMAT, QUANT_METHOD, AutoRoundQuantizeConfig, QuantizeConfig  # noqa: E402
 from gptqmodel.utils import Perplexity  # noqa: E402
-from parameterized import parameterized  # noqa: E402
-from transformers import AutoModelForCausalLM, AutoTokenizer  # noqa: E402
 class TestPerplexity(unittest.TestCase):
diff --git a/tests/test_q4_bitblas.py b/tests/test_q4_bitblas.py
index 851222d85..6cc42ff4d 100644
--- a/tests/test_q4_bitblas.py
+++ b/tests/test_q4_bitblas.py
@@ -1,15 +1,17 @@
 # -- do not touch
 import os
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
 import unittest  # noqa: E402
 import torch  # noqa: E402
+from transformers import AutoTokenizer  # noqa: E402
+
 from gptqmodel import BACKEND, GPTQModel  # noqa: E402
 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear  # noqa: E402
-from transformers import AutoTokenizer  # noqa: E402
 class TestQ4BitBLAS(unittest.TestCase):
diff --git a/tests/test_q4_cuda.py b/tests/test_q4_cuda.py
index 29f66542c..8d2927937 100644
--- a/tests/test_q4_cuda.py
+++ b/tests/test_q4_cuda.py
@@ -1,16 +1,19 @@
 # -- do not touch
 import os
+
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 # -- end do not touch
 import unittest  # noqa: E402
 import torch  # noqa: E402
-from gptqmodel import BACKEND, GPTQModel  #
noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + + GENERATE_EVAL_SIZE = 100 diff --git a/tests/test_q4_exllama_v1.py b/tests/test_q4_exllama_v1.py index 8fd3bc6bd..07f6f860e 100644 --- a/tests/test_q4_exllama_v1.py +++ b/tests/test_q4_exllama_v1.py @@ -1,20 +1,23 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 +from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params # noqa: E402 +from models.model_test import ModelTest # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, exllama_set_max_input_length # noqa: E402 from gptqmodel.models._const import EXLLAMA_DEFAULT_MAX_INPUT_LENGTH # noqa: E402 from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402 from gptqmodel.quantization import FORMAT # noqa: E402 from gptqmodel.utils.importer import select_quant_linear # noqa: E402 from gptqmodel.utils.model import gptqmodel_post_init # noqa: E402 -from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 + REFERENCE = torch.Tensor( [ diff --git a/tests/test_q4_exllama_v2.py b/tests/test_q4_exllama_v2.py index c684b6a25..ee136e257 100644 --- a/tests/test_q4_exllama_v2.py +++ b/tests/test_q4_exllama_v2.py @@ -1,19 +1,22 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 import torch # noqa: E402 +from test_q4_exllama_v1 import REFERENCE, get_diff # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear # noqa: E402 from gptqmodel.quantization import FORMAT # noqa: E402 from gptqmodel.utils.importer import select_quant_linear # noqa: E402 from gptqmodel.utils.model import gptqmodel_post_init # noqa: E402 -from test_q4_exllama_v1 import REFERENCE, get_diff # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 + GENERATE_EVAL_SIZE = 100 diff --git a/tests/test_q4_ipex.py b/tests/test_q4_ipex.py index 1d963ac0f..f0518abb4 100644 --- a/tests/test_q4_ipex.py +++ b/tests/test_q4_ipex.py @@ -2,13 +2,16 @@ import os import sys + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import torch # noqa: E402 -from gptqmodel import BACKEND # noqa: E402 from models.model_test import ModelTest # noqa: E402 +from gptqmodel import BACKEND # noqa: E402 + + GENERATE_EVAL_SIZE = 100 diff --git a/tests/test_q4_marlin.py b/tests/test_q4_marlin.py index 52734df23..aa810e366 100644 --- a/tests/test_q4_marlin.py +++ b/tests/test_q4_marlin.py @@ -1,16 +1,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 import torch # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 -from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 +from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402 + class TestQ4Marlin(unittest.TestCase): diff --git a/tests/test_q4_torch.py 
b/tests/test_q4_torch.py index cb95861d2..d55964771 100644 --- a/tests/test_q4_torch.py +++ b/tests/test_q4_torch.py @@ -1,16 +1,19 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import sys # noqa: E402 import unittest # noqa: E402 import torch # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel # noqa: E402 + + GENERATE_EVAL_SIZE = 100 diff --git a/tests/test_q4_triton.py b/tests/test_q4_triton.py index 08b0f28fd..8c79121fd 100644 --- a/tests/test_q4_triton.py +++ b/tests/test_q4_triton.py @@ -1,15 +1,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import unittest # noqa: E402 import torch # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear.tritonv2 import TritonV2QuantLinear # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 + GENERATE_EVAL_SIZE = 100 diff --git a/tests/test_quant_batch.py b/tests/test_quant_batch.py index 8e44308b8..1cd4e44f3 100644 --- a/tests/test_quant_batch.py +++ b/tests/test_quant_batch.py @@ -1,6 +1,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -8,10 +9,11 @@ import unittest # noqa: E402 from datasets import load_dataset # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import GPTQModel # noqa: E402 from gptqmodel.quantization import QuantizeConfig # noqa: E402 from gptqmodel.utils import Perplexity # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 class TestQuantBatch(unittest.TestCase): diff --git a/tests/test_quant_formats.py b/tests/test_quant_formats.py index 24fba00a7..c8f0d87ec 100644 --- a/tests/test_quant_formats.py +++ b/tests/test_quant_formats.py @@ -1,6 +1,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -10,14 +11,19 @@ import unittest # noqa: E402 from datasets import load_dataset # noqa: E402 -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 -from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 -from gptqmodel.quantization.config import (META_FIELD_QUANTIZER, META_QUANTIZER_GPTQMODEL, # noqa: E402 - AutoRoundQuantizeConfig, QuantizeConfig) from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, __version__, get_best_device # noqa: E402 +from gptqmodel.quantization import FORMAT, QUANT_CONFIG_FILENAME, QUANT_METHOD # noqa: E402 +from gptqmodel.quantization.config import ( # noqa: E402 + META_FIELD_QUANTIZER, + META_QUANTIZER_GPTQMODEL, + AutoRoundQuantizeConfig, + QuantizeConfig, +) +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 + class TestQuantization(unittest.TestCase): diff --git a/tests/test_quant_trust_remote.py b/tests/test_quant_trust_remote.py index aae8da9ab..81127d131 100644 --- a/tests/test_quant_trust_remote.py +++ b/tests/test_quant_trust_remote.py @@ -1,6 +1,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -8,9 +9,10 @@ import unittest # noqa: E402 from datasets import 
load_dataset # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import GPTQModel # noqa: E402 from gptqmodel.quantization import FORMAT, QuantizeConfig # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 class TestQuantWithTrustRemoteTrue(unittest.TestCase): diff --git a/tests/test_save_loaded_quantized_model.py b/tests/test_save_loaded_quantized_model.py index 28730fe70..8a160e025 100644 --- a/tests/test_save_loaded_quantized_model.py +++ b/tests/test_save_loaded_quantized_model.py @@ -1,15 +1,18 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import tempfile # noqa: E402 import unittest # noqa: E402 -from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402 + + MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" class TestSave(unittest.TestCase): diff --git a/tests/test_serialization.py b/tests/test_serialization.py index 5aa1d1be7..ffaedca97 100644 --- a/tests/test_serialization.py +++ b/tests/test_serialization.py @@ -1,6 +1,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch diff --git a/tests/test_sglang.py b/tests/test_sglang.py index 59fd8320b..374ab8a57 100644 --- a/tests/test_sglang.py +++ b/tests/test_sglang.py @@ -1,5 +1,6 @@ import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -9,6 +10,7 @@ import unittest # noqa: E402 import torch # noqa: E402 + from gptqmodel import BACKEND, GPTQModel # noqa: E402 diff --git a/tests/test_sharded.py b/tests/test_sharded.py index e8275d8c6..1d2338c13 100644 --- a/tests/test_sharded.py +++ b/tests/test_sharded.py @@ -1,6 +1,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -9,9 +10,10 @@ import unittest # noqa: E402 import torch # noqa: E402 -from gptqmodel import GPTQModel # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from gptqmodel import GPTQModel # noqa: E402 + class TestSharded(unittest.TestCase): MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" diff --git a/tests/test_tgi.py b/tests/test_tgi.py index c3ebf8045..d26c51ecc 100644 --- a/tests/test_tgi.py +++ b/tests/test_tgi.py @@ -1,6 +1,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch import json # noqa: E402 diff --git a/tests/test_transformers_integration.py b/tests/test_transformers_integration.py index 6c988e816..2977fb09d 100644 --- a/tests/test_transformers_integration.py +++ b/tests/test_transformers_integration.py @@ -1,9 +1,10 @@ import tempfile import unittest -from gptqmodel.integration import integration from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig +from gptqmodel.integration import integration + class TestTransformersIntegration(unittest.TestCase): diff --git a/tests/test_triton.py b/tests/test_triton.py index c9a5bf878..da71ea565 100644 --- a/tests/test_triton.py +++ b/tests/test_triton.py @@ -1,6 +1,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch @@ -9,9 +10,11 @@ import torch # noqa: E402 import torch.utils.benchmark as benchmark # noqa: E402 -from gptqmodel import BACKEND, GPTQModel # noqa: E402 from transformers import AutoTokenizer # noqa: E402 +from 
gptqmodel import BACKEND, GPTQModel # noqa: E402 + + MODEL_ID = "/monster/data/model/Llama-7B-GPTQ" DATASET_ID = "timdettmers/openassistant-guanaco" LEARNING_RATE = 3e-5 diff --git a/tests/test_verify_hash.py b/tests/test_verify_hash.py index f49591b27..7c0b246da 100644 --- a/tests/test_verify_hash.py +++ b/tests/test_verify_hash.py @@ -1,6 +1,7 @@ # -- do not touch import os + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch diff --git a/tests/test_vllm.py b/tests/test_vllm.py index 449315ec5..ef4e1b1ae 100644 --- a/tests/test_vllm.py +++ b/tests/test_vllm.py @@ -1,22 +1,23 @@ # -- do not touch import os -import tempfile + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # -- end do not touch -import gc # noqa: E402 import importlib.util # noqa: E402 import subprocess # noqa: E402 import sys # noqa: E402 +import tempfile # noqa: E402 import unittest # noqa: E402 -import torch -from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 -from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 +import torch # noqa: E402 from datasets import load_dataset # noqa: E402 +from transformers import AutoTokenizer # noqa: E402 + from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 +from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 +from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 class TestLoadVLLM(unittest.TestCase):