
Commit

Merge remote-tracking branch 'origin/main' into zx_support_OVIS
ZX-ModelCloud committed Dec 19, 2024
2 parents fe6fce8 + d4f6f08 commit 075aa89
Showing 85 changed files with 559 additions and 225 deletions.
47 changes: 38 additions & 9 deletions .github/workflows/unit_tests.yml
@@ -44,7 +44,7 @@ env:
MAX_JOBS: 8
RUNNER: 10.0.14.248
TRANSFORMERS_DIFF_TESTS: "models/test_internlm,models/test_internlm2_5,models/test_xverse"
TORCH_2_5_TESTS: "test_q4_ipex.py,test_save_loaded_quantized_model,test_quant_formats,models/test_hymba"
TORCH_2_5_TESTS: "test_q4_ipex.py,test_ipex_xpu.py,test_save_loaded_quantized_model,test_quant_formats,models/test_hymba"
IGNORED_TEST_FILES: "test_tgi.py,test_gptneox.py,models/test_mixtral"
GPTQMODEL_FORCE_BUILD: 1
repo: ${{ github.event.inputs.repo || github.repository }}
@@ -190,7 +190,9 @@ jobs:
- name: Install requirements
run: |
echo "===== init test env ====="
bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.4.1 3.11
echo "===== install transformers typing-extensions ====="
uv pip install transformers typing-extensions -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
- name: Compile
@@ -302,20 +304,24 @@ jobs:

- name: Install wheel
run: |
uv pip install optimum bitblas==0.0.1.dev13 parameterized uvicorn -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
echo "===== install optimum bitblas ====="
uv pip install optimum bitblas==0.0.1.dev13 uvicorn -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
echo "===== install dist/whl ====="
uv pip install dist/*.whl
if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then
echo "===== install auto_round ====="
uv pip install auto_round
fi
bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.4.1 3.11
uv pip install typing-extensions numpy==1.26.4 -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
if [ "${{ matrix.test_script }}" == "test_cohere2" ]; then
echo "===== install transformers from git ====="
uv pip install -U git+https://github.com/huggingface/transformers.git@5615a393691c81e00251e420c73e4d04c6fe22e5
else
echo "===== install transformers from pypi ====="
uv pip install transformers -U
fi
echo "===== install typing-extensions numpy==1.26.4 ====="
uv pip install typing-extensions numpy==1.26.4 -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
- name: Check platform
run: |
@@ -427,14 +433,20 @@ jobs:

- name: Install wheel
run: |
echo "===== install optimum bitblas parameterized uvicorn ====="
uv pip install optimum bitblas==0.0.1.dev13 parameterized uvicorn -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
echo "===== install dist/whl ====="
uv pip install dist/*.whl
echo "===== init test env ====="
bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.4.1 3.11
echo "===== install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 ====="
uv pip install transformers==4.38.2 typing-extensions numpy==1.26.4 peft==0.13.2 -U -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
if [ "${{ matrix.test_script }}" = "test_xverse" ]; then
echo "===== install tokenizers==0.15.2 ====="
uv pip install tokenizers==0.15.2 -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
fi
if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then
echo "===== install auto_round ====="
uv pip install auto_round
fi
@@ -474,8 +486,10 @@ jobs:
runs-on: self-hosted
if: always() && !cancelled() && (needs.build.result == 'success' || github.event.inputs.artifact_id != '') && needs.list-test-files.outputs.torch-2-5-files != '[]'
container:
image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v2-torch2.5.1
image: ${{ needs.check-vm.outputs.ip }}:5000/modelcloud/gptqmodel:github-ci-v3-torch2.5.1
options: --device /dev/dri --ipc=host
volumes:
- /dev/dri/by-path:/dev/dri/by-path
- /home/ci/models:/monster/data/model
strategy:
fail-fast: false
@@ -533,11 +547,17 @@ jobs:

- name: Install wheel
run: |
bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.5.1 3.11
uv pip install -U intel_extension_for_pytorch typing-extensions bitblas==0.0.1.dev13 -i http://${{ needs.check-vm.outputs.ip }}/simple/ --trusted-host ${{ needs.check-vm.outputs.ip }}
if [ "${{ matrix.test_script }}" == "test_ipex_xpu" ]; then
source /etc/profile.d/pyenv.sh && pyenv activate xpu
else
bash -c "$(curl -L http://$RUNNER/files/scripts/init_unit_tests.sh)" @ 12.4 2.5.1 3.11
fi
if [ "${{ matrix.test_script }}" == "test_quant_formats" ] || [ "${{ matrix.test_script }}" == "test_perplexity" ]; then
echo "===== install auto_round ====="
uv pip install auto_round
fi
echo "===== install dist/whl ====="
uv pip install dist/*.whl
- name: Find suitable GPU
@@ -562,7 +582,16 @@ jobs:
- name: Run tests
if: ${{ (!github.event.inputs.test_names || contains(github.event.inputs.test_names, matrix.test_script)) && !cancelled() }}
run: pytest --durations=0 tests/${{ matrix.test_script }}.py
run: |
if [ "${{ matrix.test_script }}" == "test_ipex_xpu" ]; then
export CUDA_VISIBLE_DEVICES=""
source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
source $ONEAPI_ROOT/../pti/0.9/env/vars.sh
export Pti_DIR=$ONEAPI_ROOT/../pti/0.9/lib/cmake/pti
source /etc/profile.d/pyenv.sh && pyenv activate xpu
pip list
fi
pytest --durations=0 tests/${{ matrix.test_script }}.py
- name: Release GPU
if: always()
20 changes: 13 additions & 7 deletions README.md
@@ -9,19 +9,22 @@
</p>

## News
* 12/16/2024 1.4.5-dev: Windows 11 support added/validated. Fix `dynamic` loading.
* 12/15/2024 [1.4.2](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.4.2): MacOS `gpu` (Metal) and `cpu` (M+) support added/validated for inference and quantization. Cohere 2 model support added.
* 12/13/2024 [1.4.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.4.1): Added Qwen2-VL model support. `mse` quantization control exposed in `QuantizeConfig`. Monkey patch `patch_vllm()` and `patch_hf()` api added to allow Transformers/Optimum/PEFT and vLLM to correctly load GPTQModel quantized models while upstream PRs are pending.
* 12/10/2024 [1.4.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.4.0) `EvalPlus` harness integration merged upstream. We now support both `lm-eval` and `EvalPlus`. Added pure torch `Torch` kernel. Refactored `Cuda` kernel to be `DynamicCuda` kernel. `Triton` kernel now auto-padded for max model support. `Dynamic` quantization now supports both positive `+:` (default) and negative `-:` matching, which allows matched modules to be skipped entirely for quantization. Fixed auto-`Marlin` kernel selection. Added auto-kernel fallback for unsupported kernel/module pairs. Lots of internal refactoring and cleanup in preparation for the transformers/optimum/peft upstream PR merge. Deprecated the saving of `Marlin` weight format since `Marlin` supports auto conversion of `gptq` format to `Marlin` during runtime.

* 11/29/2024 [1.3.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.3.1) Olmo2 model support. Intel XPU acceleration via IPEX. Model sharding Transformer compat fix due to api deprecation in HF. Removed triton dependency. Triton kernel now optionally dependent on triton pkg.
* 11/26/2024 [1.3.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.3.0) Zero-Day Hymba model support. Removed `tqdm` and `rouge` dependency.
* 11/24/2024 [1.2.3](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.2.3) HF GLM model support. ClearML logging integration. Use `device-smi` to replace the `gputil` + `psutil` dependencies. Fixed model unit tests.
* 11/11/2024 🚀 [1.2.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.2.1) Meta MobileLLM model support added. `lm-eval[gptqmodel]` integration merged upstream. Intel/IPEX cpu inference merged replacing QBits (deprecated). Auto-fix/patch ChatGLM-3/GLM-4 compat with latest transformers. New `.load()` and `.save()` api.
* 10/29/2024 🚀 [1.1.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.1.0) IBM Granite model support. Full auto-buildless wheel install from pypi. Reduce max cpu memory usage by >20% during quantization. 100% CI model/feature coverage.

<details>

<summary>Archived News:</summary>
* 11/11/2024 🚀 [1.2.1](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.2.1) Meta MobileLLM model support added. `lm-eval[gptqmodel]` integration merged upstream. Intel/IPEX cpu inference merged replacing QBits (deprecated). Auto-fix/patch ChatGLM-3/GLM-4 compat with latest transformers. New `.load()` and `.save()` api.

* 10/29/2024 🚀 [1.1.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.1.0) IBM Granite model support. Full auto-buildless wheel install from pypi. Reduce max cpu memory usage by >20% during quantization. 100% CI model/feature coverage.

* 10/12/2024 ✨ [1.0.9](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.0.9) Move AutoRound to optional and fix pip install regression in v1.0.8.

* 10/11/2024 ✨ [1.0.8](https://github.com/ModelCloud/GPTQModel/releases/tag/v1.0.8) Add wheel for python 3.12 and cuda 11.8.
@@ -61,6 +64,7 @@ Public tests/papers and ModelCloud's internal tests have shown that GPTQ is on-p

## Features
* 🚀 Extensive model support for: `Llama 1-3.3`, `Qwen2-VL`, `Olmo2`, `Hymba`, `GLM`, `IBM Granite`, `Llama 3.2 Vision`, `MiniCPM3`, `GRIN-Moe`, `Phi 1-4`, `EXAONE 3.0`, `InternLM 2.5`, `Gemma 2`, `DeepSeek-V2`, `DeepSeek-V2-Lite`, `ChatGLM`, `MiniCPM`, `Qwen2MoE`, `DBRX`.
* ✨ Linux, MacOS, Windows platform quantization and accelerated inference support.
* 💯 100% CI unit-test coverage for all supported models and kernels including post-quantization quality regression.
* ✨ `Dynamic`/Mixed quantization control on a per-module basis. Each layer/module can have a unique quantization config or be excluded from quantization altogether (see the sketch below).
* 🚀 [vLLM](https://github.com/vllm-project/vllm) and [SGLang](https://github.com/sgl-project/sglang) inference integration for quantized models where format = `FORMAT.GPTQ`
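
The `Dynamic` per-module control above is easiest to see in code. The following is a minimal, hedged sketch: the `+:`/`-:` prefix convention comes from the 1.4.0 notes earlier in this README, but the exact key syntax, layer regexes, and model id are illustrative assumptions to verify against the GPTQModel docs.

```python
# Hedged sketch of per-module ("dynamic") quantization overrides.
# Key syntax, layer regexes, and the model id are assumptions, not verbatim API.
from gptqmodel import GPTQModel, QuantizeConfig

dynamic = {
    # "-:" negative match: modules matching the regex are skipped entirely
    r"-:model\.layers\.0\..*": {},
    # "+:" positive match: matching modules get their own overrides
    r"+:model\.layers\.1\.mlp\..*": {"bits": 8, "group_size": 32},
}

quant_config = QuantizeConfig(bits=4, group_size=128, dynamic=dynamic)
model = GPTQModel.load("TinyLlama/TinyLlama-1.1B-Chat-v1.0", quant_config)
# model.quantize(calibration_dataset) would then apply the per-module overrides.
```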
@@ -79,7 +83,9 @@ Public tests/papers and ModelCloud's internal tests have shown that GPTQ is on-p
## Quality: GPTQModel 4bit can match BF16:
🤗 [ModelCloud quantized ultra-high recovery vortex-series models on HF](https://huggingface.co/collections/ModelCloud/vortex-673743382af0a52b2a8b9fe2)

![image](https://github.com/user-attachments/assets/aab69119-f9c8-4c94-9634-a3c63e57095e)
![image](https://github.com/user-attachments/assets/7b2db012-b8af-4d19-a25d-7023cef19220)



## Model Support: 🚀 (Added by GPTQModel)
| Model | | | | | | | | |
@@ -96,17 +102,17 @@ Public tests/papers and ModelCloud's internal tests have shown that GPTQ is on-p
| EXAONE 3.0 | 🚀 | InternLM 1/2.5 | 🚀 | OPT || Yi || |


## Kernel and HW Accelerator Support
## Platform and HW Support

GPTQModel is validated for Linux x86_64 with the following devices:
GPTQModel is validated for Linux, MacOS, and Windows 11:

| Platform | Device | | Optimized Arch | Kernels |
|-----------------|---------------| --- | -------------- | -------------- |
| Linux | Nvidia GPU | ✅ | Ampere or Higher | Marlin, Exllama V2, Exllama V1, Triton, DynamicCuda, Torch |
| Linux | Intel/AMD CPU || `avx512` or `amx` | IPEX, Torch |
| Linux | Intel XPU || Intel Arc + Datacenter Max | IPEX, Torch |
| MacOS | GPU (Metal) and CPU | ✅ | M1+ | Torch |
| MacOS | GPU (Metal) / CPU || M1+ | Torch |
| Windows 11 | GPU (Nvidia) / CPU || Nvidia | DynamicCuda, Torch |
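
Kernel choice from the table above can also be pinned explicitly when loading an already-quantized model. A minimal sketch, reusing the model id from the `examples/inference/run_transformers.py` diff further down; the `BACKEND` member names are assumptions inferred from the kernels listed in this table:

```python
# Hedged sketch: select a kernel explicitly at load time.
# BACKEND member names (e.g. TORCH) are assumed from the kernel table above;
# omitting `backend` lets GPTQModel auto-select the best kernel for the platform.
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel

model_id = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = GPTQModel.load(model_id, backend=BACKEND.TORCH)  # portable pure-torch fallback
inputs = tokenizer("gptqmodel is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs)[0]))
```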

## Install

6 changes: 4 additions & 2 deletions examples/benchmark/generation_speed.py
@@ -7,11 +7,13 @@

import torch
from datasets import Dataset, load_dataset
from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.utils.progress import ProgressBar
from transformers import AutoTokenizer, GenerationConfig
from transformers.generation.logits_process import LogitsProcessor

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.utils.progress import ProgressBar


logger = logging.getLogger(__name__)

random.seed(0)
2 changes: 2 additions & 0 deletions examples/benchmark/ipex.py
@@ -4,6 +4,7 @@
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


try:
from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf
bind_cores_for_best_perf()
@@ -13,6 +14,7 @@

import argparse


parser = argparse.ArgumentParser(description="Benchmark IPEX vs HF on a pre-trained model.")
parser.add_argument("--model", type=str, required=True, help="Path or name of the pre-trained model.")
parser.add_argument("--cores", type=int, default=8, help="Number of CPU cores to use.")
6 changes: 4 additions & 2 deletions examples/benchmark/perplexity.py
@@ -2,9 +2,11 @@
import os

import torch
from gptqmodel.utils import Perplexity
from transformers import AutoTokenizer

from gptqmodel.utils import Perplexity


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

if __name__ == "__main__":
@@ -51,7 +53,7 @@
tokenizer.pad_token_id = tokenizer.eos_token_id

if args.is_quantized:
from gptqmodel import GPTQModel, BACKEND
from gptqmodel import BACKEND, GPTQModel

model = GPTQModel.load(
args.model_name,
5 changes: 3 additions & 2 deletions examples/evaluation/run_language_modeling_task.py
@@ -2,12 +2,13 @@

import datasets
import torch
from gptqmodel import GPTQModel, QuantizeConfig, BACKEND
from gptqmodel.eval_tasks import LanguageModelingTask
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import LanguageModelingTask
from gptqmodel.utils.torch import torch_empty_cache


DATASET = "tatsu-lab/alpaca"
WITH_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nInput:\n{input}\n\nOutput:\n"
WITHOUT_INPUT_TEMPLATE = "Instruction:\n{instruction}\n\nOutput:\n"
5 changes: 3 additions & 2 deletions examples/evaluation/run_sequence_classification_task.py
@@ -3,12 +3,13 @@

import datasets
import torch
from gptqmodel import GPTQModel, QuantizeConfig, BACKEND
from gptqmodel.eval_tasks import SequenceClassificationTask
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import SequenceClassificationTask
from gptqmodel.utils.torch import torch_empty_cache


DATASET = "cardiffnlp/tweet_sentiment_multilingual"
TEMPLATE = "Question:What's the sentiment of the given text? Choices are {labels}.\nText: {text}\nAnswer:"
ID2LABEL = {0: "negative", 1: "neutral", 2: "positive"}
5 changes: 3 additions & 2 deletions examples/evaluation/run_text_summarization_task.py
@@ -3,12 +3,13 @@

import datasets
import torch
from gptqmodel import GPTQModel, QuantizeConfig, BACKEND
from gptqmodel.eval_tasks import TextSummarizationTask
from transformers import AutoTokenizer, GenerationConfig

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig
from gptqmodel.eval_tasks import TextSummarizationTask
from gptqmodel.utils.torch import torch_empty_cache


os.system("pip install py7zr")


1 change: 1 addition & 0 deletions examples/inference/run_transformers.py
@@ -1,5 +1,6 @@
from transformers import AutoModelForCausalLM, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
quantized_model = AutoModelForCausalLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ")
print(tokenizer.decode(quantized_model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(quantized_model.device))[0]))
4 changes: 3 additions & 1 deletion examples/inference/run_with_different_backends.py
@@ -3,9 +3,11 @@
import sys
from argparse import ArgumentParser

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device
from transformers import AutoTokenizer

from gptqmodel import BACKEND, GPTQModel, QuantizeConfig, get_best_device


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
pretrained_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "./TinyLlama/TinyLlama-1.1B-Chat-v1.0-4bit-128g"
4 changes: 3 additions & 1 deletion examples/quantization/basic_usage.py
@@ -1,8 +1,10 @@
import os

from gptqmodel import GPTQModel, QuantizeConfig, get_best_device
from transformers import AutoTokenizer

from gptqmodel import GPTQModel, QuantizeConfig, get_best_device


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"

pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
4 changes: 3 additions & 1 deletion examples/quantization/basic_usage_autoround.py
@@ -1,7 +1,9 @@
import torch
from transformers import AutoTokenizer

from gptqmodel import GPTQModel
from gptqmodel.quantization.config import AutoRoundQuantizeConfig # noqa: E402
from transformers import AutoTokenizer


pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "./autoround/TinyLlama-1.1B-Chat-v1.0-4bit-128g"
4 changes: 3 additions & 1 deletion examples/quantization/basic_usage_wikitext2.py
@@ -1,8 +1,10 @@
import torch
from datasets import load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer

from gptqmodel import GPTQModel, QuantizeConfig


pretrained_model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0" # "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantized_model_id = "TinyLlama-1.1B-Chat-v1.0-4bit-128g"

1 change: 1 addition & 0 deletions examples/quantization/transformers_usage.py
@@ -1,5 +1,6 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig


model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)
dataset = ["gptqmodel is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
2 changes: 1 addition & 1 deletion gptqmodel/__init__.py
@@ -1,5 +1,5 @@
from .models import GPTQModel, get_best_device
from .utils import BACKEND
from .quantization import BaseQuantizeConfig, QuantizeConfig
from .utils import BACKEND
from .utils.exllama import exllama_set_max_input_length
from .version import __version__
2 changes: 1 addition & 1 deletion gptqmodel/integration/src/optimum/gptq/quantizer.py
@@ -625,7 +625,7 @@ def tmp(_, input, output):
h.remove()
for name in subset_name_list:
logger.info(f"Quantizing {name} in block {i + 1}/{len(blocks)}...")
quant_outputs = gptq[name].hf_quantize(
quant_outputs = gptq[name].fasterquant(
percdamp=self.damp_percent, group_size=self.group_size, actorder=self.desc_act
)
scale, zero, g_idx = quant_outputs[0], quant_outputs[1], quant_outputs[2]
@@ -72,7 +72,7 @@ def validate_environment(self, *args, **kwargs):
)
elif is_gptqmodel_available() and (
version.parse(importlib.metadata.version("gptqmodel")) < version.parse("1.4.3")
or version.parse(importlib.metadata.version("optimum")) < version.parse("1.23.99")
or version.parse(importlib.metadata.version("optimum")) < version.parse("1.23.3")
):
raise ImportError("The gptqmodel version should be >= 1.4.3, optimum version should >= 1.24.0")

