From 3cd9a8ce512da119a8cd47a50a75a4dee33d83d2 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Fri, 7 Jun 2024 12:45:31 -0400 Subject: [PATCH 01/19] fix save path in llama7b_w4a16_quantization.ipynb (#2321) --- examples/llama7b_w4a16_quantization.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama7b_w4a16_quantization.ipynb b/examples/llama7b_w4a16_quantization.ipynb index 194215891fa..4ee88ff0b05 100644 --- a/examples/llama7b_w4a16_quantization.ipynb +++ b/examples/llama7b_w4a16_quantization.ipynb @@ -153,7 +153,7 @@ "metadata": {}, "outputs": [], "source": [ - "model.save_pretrained(\"/network/sadkins/llama1.1b_W4A16_channel_packed\", save_compressed=True)" + "model.save_pretrained(\"llama1.1b_W4A16_channel_packed\", save_compressed=True)" ] } ], From 934f0d8b9b12845fa9b82fed87d4b54cdfec7a3d Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 10 Jun 2024 11:39:47 -0400 Subject: [PATCH 02/19] Update Quantization Logging to New Framework (#2313) * use new quant framework for logging * fix legacy compatability * fix --- src/sparseml/pytorch/utils/helpers.py | 31 ++++++------------- .../transformers/finetune/session_mixin.py | 11 +++++-- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/src/sparseml/pytorch/utils/helpers.py b/src/sparseml/pytorch/utils/helpers.py index 4b495afe497..e9c603355de 100644 --- a/src/sparseml/pytorch/utils/helpers.py +++ b/src/sparseml/pytorch/utils/helpers.py @@ -20,7 +20,6 @@ import os import random import re -import warnings from collections import OrderedDict, namedtuple from contextlib import contextmanager from copy import deepcopy @@ -30,7 +29,7 @@ import torch from packaging import version from torch import Tensor -from torch.nn import Linear, Module, Parameter +from torch.nn import Embedding, Linear, Module, Parameter from torch.nn.modules.conv import Conv2d, Conv3d, _ConvNd from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader @@ -780,6 +779,7 @@ 
def get_prunable_layers(module: Module) -> List[Tuple[str, Module]]: for (name, mod) in module.named_modules() if ( isinstance(mod, Linear) + or isinstance(mod, Embedding) or isinstance(mod, _ConvNd) or (QATLinear and isinstance(mod, QATLinear)) or (QATConv2d and isinstance(mod, QATConv2d)) @@ -793,7 +793,7 @@ def get_quantizable_layers(module: Module) -> List[Tuple[str, Module]]: """ :param module: the module to get the quantizable layers from :return: a list containing the names and modules of the quantizable layers - (Linear, Conv2d, Conv3d) + (Embedding, Linear, Conv2d, Conv3d) """ if QATLinear is None: raise ImportError( @@ -806,6 +806,7 @@ def get_quantizable_layers(module: Module) -> List[Tuple[str, Module]]: for (name, mod) in module.named_modules() if ( isinstance(mod, Linear) + or isinstance(mod, Embedding) or isinstance(mod, Conv2d) or (QATConv3d and isinstance(mod, Conv3d)) ) @@ -816,29 +817,15 @@ def get_quantized_layers(module: Module) -> List[Tuple[str, Module]]: """ :param module: the module to get the quantized layers from :return: a list containing the names and modules of the quantized layers - (Linear, Conv2d, Conv3d) + (Embedding, Linear, Conv2d, Conv3d) """ - if QATLinear is None: - raise ImportError( - "PyTorch version is not setup for Quantization. " - "Please install a QAT compatible version of PyTorch" - ) quantized_layers = [] for (name, mod) in module.named_modules(): - if ( - (QATLinear and isinstance(mod, QATLinear)) - or (QATConv2d and isinstance(mod, QATConv2d)) - or (QATConv3d and isinstance(mod, QATConv3d)) - ): - quantized_layers.append((name, mod)) - - elif isinstance(mod, Conv3d) and not QATConv3d: - warnings.warn( - "Pytorch version is not setup for Conv3D Quantization. 
" - "Quantization of Conv3D layers will be skipped", - UserWarning, - ) + if hasattr(mod, "quantization_scheme"): + weight_scheme = getattr(mod.quantization_scheme, "weights", None) + if weight_scheme is not None and hasattr(mod, "weight"): + quantized_layers.append((name, mod)) return quantized_layers diff --git a/src/sparseml/transformers/finetune/session_mixin.py b/src/sparseml/transformers/finetune/session_mixin.py index 149b59be7cd..7436261980e 100644 --- a/src/sparseml/transformers/finetune/session_mixin.py +++ b/src/sparseml/transformers/finetune/session_mixin.py @@ -500,15 +500,22 @@ def log_model_sparsification(self): f"Sparsification info for {type(self.model).__name__}: " f"{sparsification_info.params_total} total params. " ) + sparsity_percent_formatted = "{:.2f}".format( + sparsification_info.params_prunable_sparse_percent + ) _LOGGER.info( f"There are {sparsification_info.params_prunable_total} prunable " - f"params which have {sparsification_info.params_prunable_sparse_percent} " + f"params which have {sparsity_percent_formatted}% " "avg sparsity." ) + + quant_percent_formatted = "{:.2f}".format( + sparsification_info.params_quantized_percent + ) _LOGGER.info( f"There are {sparsification_info.params_quantizable} quantizable " f"params, with a quantization percentage of " - f"{sparsification_info.params_quantized_percent}." + f"{quant_percent_formatted}%." 
) def _prepare_model_for_fsdp(self): From e255b17765add46053a2669086cbc95b3fff406c Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 11 Jun 2024 15:04:28 -0400 Subject: [PATCH 03/19] Fix for Sparsity Persist (#2323) * fix sparsity persist * helper moved to compressed-tensors --- .../quantization/gptq/utils/gptq_wrapper.py | 43 +++++++++---------- .../obcq/test_mask_structure_preservation.py | 24 +---------- 2 files changed, 21 insertions(+), 46 deletions(-) diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py index 73321c0d0aa..ded28b4123b 100644 --- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py +++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py @@ -103,6 +103,14 @@ def fasterprune( W = W.t() W = W.float() + sparsity = tensor_sparsity(W) + preserve_zeros = sparsity >= SPARSITY_THRESHOLD + W_nz_mask = ( + (~torch.isclose(W, torch.zeros(1, device=W.device).float())).float() + if preserve_zeros + else None + ) + tick = time.time() dead = torch.diag(self.H) == 0 @@ -119,17 +127,6 @@ def fasterprune( self.H = torch.linalg.cholesky(self.H, upper=True) Hinv = self.H - sparsity = tensor_sparsity(W) - mask = ( - torch.where( - W == 0, - torch.tensor(1, dtype=torch.bool), - torch.tensor(0, dtype=torch.bool), - ) - if sparsity >= SPARSITY_THRESHOLD - else None - ) - # See section 3.4 of https://arxiv.org/abs/2203.07259 for i1 in range(0, self.columns, blocksize): i2 = min(i1 + blocksize, self.columns) @@ -141,21 +138,13 @@ def fasterprune( Losses1 = torch.zeros_like(W1) Hinv1 = Hinv[i1:i2, i1:i2] - if sparsity >= SPARSITY_THRESHOLD: - tmp = ( - (~mask[:, i1:i2]) - * W1**2 - / (torch.diag(Hinv1).reshape((1, -1))) ** 2 - ) - thresh = torch.sort(tmp.flatten())[0][int(tmp.numel() * sparsity)] - mask1 = tmp <= thresh + if preserve_zeros: + W1_nz_mask = W_nz_mask[:, i1:i2] for i in range(count): w = W1[:, i] d = Hinv1[i, i] q = w.clone() - if sparsity >= 
SPARSITY_THRESHOLD: - q[mask1[:, i]] = 0 if hasattr(self.layer, "weight_fake_quant"): scale = self.layer.weight_fake_quant.scale @@ -216,13 +205,21 @@ def fasterprune( Losses1[:, i] = (w - q) ** 2 / d**2 err1 = (w - q) / d - W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) + w1_err = err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) + if preserve_zeros: + W1[:, i:] -= w1_err * W1_nz_mask[:, i:] + else: + W1[:, i:] -= w1_err Err1[:, i] = err1 W[:, i1:i2] = Q1 Losses += torch.sum(Losses1, 1) / 2 - W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) + w_err = Err1.matmul(Hinv[i1:i2, i2:]) + if preserve_zeros: + W[:, i2:] -= w_err * W_nz_mask[:, i2:] + else: + W[:, i2:] -= w_err _LOGGER.info("time %.2f" % (time.time() - tick)) _LOGGER.info("error %.2f" % torch.sum(Losses).item()) diff --git a/tests/sparseml/transformers/obcq/test_mask_structure_preservation.py b/tests/sparseml/transformers/obcq/test_mask_structure_preservation.py index a068c391431..eca6f5d2379 100644 --- a/tests/sparseml/transformers/obcq/test_mask_structure_preservation.py +++ b/tests/sparseml/transformers/obcq/test_mask_structure_preservation.py @@ -19,6 +19,7 @@ import pytest import sparseml +from compressed_tensors.compressors.utils import tensor_follows_mask_structure from parameterized import parameterized_class from tests.testing_utils import parse_params, requires_torch @@ -28,29 +29,6 @@ ) -def tensor_follows_mask_structure(tensor, mask: str = "2:4"): - """ - :param tensor: tensor to check - :param mask: mask structure to check for, in the format "n:m" - :return: True if the tensor follows the mask structure, False otherwise. 
- Note, some weights can incidentally be zero, so we check for - atleast n zeros in each chunk of size m - """ - import torch - - n, m = tuple(map(int, mask.split(":"))) - # Reshape the tensor into chunks of size m - tensor = tensor.view(-1, m) - - # Count the number of zeros in each chunk - zero_counts = (tensor == 0).sum(dim=1) - - # Check if the number of zeros in each chunk atleast n - # Greater than sign is needed as some weights can incidentally - # be zero - return torch.all(zero_counts >= n) - - @requires_torch @pytest.mark.integration @parameterized_class(parse_params(MASK_STRUCTURE_CONFIGS_DIRECTORY)) From 4e2ad0ac56ab3569aa350e21bed2f13da11b3408 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 12 Jun 2024 12:01:53 -0400 Subject: [PATCH 04/19] [GHA] Update End-to-End Nightly Build Process (#2304) * trigger nightly workflow * update condition * update * update condition * skip actual tests to speed up testing * try true conditions * try agin * try again * clean-up * update condiitions * try again * try again * try fil case * update * try new condition * try again * try again * try again * revert * try new conditions * typo * try again * try dev workflow * try again * update condition * update * try again * test failure case * update * try again * update * try nightly * add publish --------- Co-authored-by: Sara Adkins --- .github/workflows/build-container.yml | 10 ++- .github/workflows/build-nightly.yml | 22 ------ .../workflows/build-wheel-and-container.yml | 39 ++++----- .../publish-nightly-docker-images.yaml | 79 ------------------- .github/workflows/test-nightly.yml | 4 +- ...nternal.yml => test-wheel-and-publish.yml} | 39 ++++++--- 6 files changed, 59 insertions(+), 134 deletions(-) delete mode 100644 .github/workflows/build-nightly.yml delete mode 100644 .github/workflows/publish-nightly-docker-images.yaml rename .github/workflows/{test-wheel-push-to-internal.yml => test-wheel-and-publish.yml} (57%) diff --git 
a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml index 9eda86ae0d0..ae7cc43bc52 100644 --- a/.github/workflows/build-container.yml +++ b/.github/workflows/build-container.yml @@ -53,4 +53,12 @@ jobs: build-args: | BRANCH=${{github.head_ref}} push: true - tags: ghcr.io/neuralmagic/sparseml-dev:${{ inputs.name }} \ No newline at end of file + tags: ghcr.io/neuralmagic/sparseml-dev:${{ inputs.name }} + + - name: Build Nightly Docker Container + if: ${{ inputs.dev == 'false' && inputs.release == 'false'}} + uses: docker/build-push-action@v4 + with: + context: ./docker/containers/docker_nightly + push: true + tags: ghcr.io/neuralmagic/sparseml-nightly:latest, ghcr.io/neuralmagic/sparseml-nightly:${{ steps.date.outputs.date }} \ No newline at end of file diff --git a/.github/workflows/build-nightly.yml b/.github/workflows/build-nightly.yml deleted file mode 100644 index be44d8b863e..00000000000 --- a/.github/workflows/build-nightly.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: build-nightly -run-name: ${{ github.workflow }} is to create nightly wheel file for pypi -on: - push: - branches: - - 'main' - schedule: - - cron: '30 0 * * *' - workflow_dispatch: - - -jobs: - - BUILD-SPARSEML-NIGHTLY: - - uses: ./.github/workflows/util.yml - with: - runs_on: ubuntu-22.04 - run_id: ${{ github.run_id }} - build_type: nightly - testmo_project_id: 9 - secrets: inherit diff --git a/.github/workflows/build-wheel-and-container.yml b/.github/workflows/build-wheel-and-container.yml index 3eaaf674e08..421e227577a 100644 --- a/.github/workflows/build-wheel-and-container.yml +++ b/.github/workflows/build-wheel-and-container.yml @@ -4,15 +4,8 @@ on: types: [opened, synchronize, reopened] branches: - main - - 'release/[0-9]+.[0-9]+' - push: - branches: - - 'release/[0-9]+.[0-9]+' - - main - release: - types: [created, published] schedule: - - cron: '0 0 * * *' + - cron: '0 20 * * *' permissions: id-token: write @@ -23,10 +16,10 @@ concurrency: group: ${{ 
github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true -# if not dev or release, will create a nightly build +# TODO: do we want to push to nightly everytime we push to main? +# if not dev or release, will create a nightly build; turning off release for now env: - PRODUCTION: ${{ github.event_name == 'schedule' || github.event_name == 'release'}} - RELEASE: ${{ github.event_name =='release' || startsWith(github.base_ref, 'release/') }} + RELEASE: 'false' DEV: ${{ github.base_ref == 'main' && github.event_name == 'pull_request'}} jobs: @@ -42,8 +35,14 @@ jobs: echo "dev=$DEV" >> $GITHUB_OUTPUT echo "release=$RELEASE" >> $GITHUB_OUTPUT - build-wheel-and-push: + test-nightly: needs: set-outputs + if: ${{ needs.set-outputs.outputs.dev == 'false' && needs.set-outputs.outputs.release == 'false'}} + uses: ./.github/workflows/test-nightly.yml + + build-wheel-and-push: + needs: [set-outputs, test-nightly] + if: ${{ always() && needs.set-outputs.outputs.dev == 'false' && needs.test-nightly.result == 'success' || always() && needs.set-outputs.outputs.dev == 'true' && needs.set-outputs.result == 'success' }} uses: ./.github/workflows/build-wheel.yml with: build-label: ubuntu-20.04 @@ -55,22 +54,24 @@ jobs: python: '3.10' secrets: inherit - test-wheel-and-push-internal: - needs: build-wheel-and-push - uses: ./.github/workflows/test-wheel-push-to-internal.yml + test-wheel-and-publish: + needs: [set-outputs, build-wheel-and-push] + if: ${{ always() && !cancelled() && needs.build-wheel-and-push.result == 'success' }} + uses: ./.github/workflows/test-wheel-and-publish.yml with: build-label: ubuntu-20.04 whl: ${{ needs.build-wheel-and-push.outputs.wheel }} python: '3.10' + dev: ${{ needs.set-outputs.outputs.dev }} + release: ${{ needs.set-outputs.outputs.release }} secrets: inherit - # TODO: add nightly and release container build steps once wheel build push - # to production is automated. Removed until then. 
build-container-and-push: - needs: [set-outputs, test-wheel-and-push-internal] + needs: [test-wheel-and-publish, set-outputs] + if: ${{ always() && !cancelled() && needs.test-wheel-and-publish.result == 'success' }} uses: ./.github/workflows/build-container.yml with: - build-label: k8s-eng-gpu-64G-v100-32G + build-label: k8s-eng-gpu-16G-t4-32G dev: ${{ needs.set-outputs.outputs.dev }} release: ${{ needs.set-outputs.outputs.release }} name: ${{ github.event.number }} diff --git a/.github/workflows/publish-nightly-docker-images.yaml b/.github/workflows/publish-nightly-docker-images.yaml deleted file mode 100644 index 5ca14ac08bc..00000000000 --- a/.github/workflows/publish-nightly-docker-images.yaml +++ /dev/null @@ -1,79 +0,0 @@ -name: Publish Nightly Docker Images - -on: - push: - branches: - - 'main' - schedule: - - cron: '0 1 * * *' - workflow_dispatch: -jobs: - push-nightly-docker-image: - name: Push Version Tagged Nightly Docker Images - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - - steps: - - name: Set up Docker Buildx - id: buildx - uses: docker/setup-buildx-action@v2 - with: - buildkitd-flags: --debug - - - name: Login to Github Packages - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Checkout code - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - - name: Get version tag - id: extract_tag - run: echo "tag=$(date +%Y%m%d)" >> $GITHUB_OUTPUT - - - name: Current Version Name - run: | - echo ${{ steps.extract_tag.outputs.tag }} - - - name: Sparseml-Nightly latest using default cuda 11.1.1 - uses: docker/build-push-action@v2 - with: - context: ./docker - build-args: | - DEPS=all - BRANCH=main - push: true - tags: | - ghcr.io/neuralmagic/sparseml-nightly:latest - - - name: Today's Sparseml-Nightly using default cuda 11.1.1 - uses: docker/build-push-action@v2 - with: - context: ./docker - build-args: | - DEPS=all - BRANCH=main - push: 
true - tags: | - ghcr.io/neuralmagic/sparseml-nightly:${{ steps.extract_tag.outputs.tag }} - - - name: Today's Sparseml-Nightly Base using default cuda 11.1.1 - uses: docker/build-push-action@v2 - with: - context: ./docker - build-args: | - DEPS=base - BRANCH=main - push: true - tags: | - ghcr.io/neuralmagic/sparseml-nightly:base-${{ steps.extract_tag.outputs.tag }} - - - name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/test-nightly.yml b/.github/workflows/test-nightly.yml index 8472b6c8134..4fc1c19cd84 100644 --- a/.github/workflows/test-nightly.yml +++ b/.github/workflows/test-nightly.yml @@ -1,8 +1,7 @@ name: Run Nightly Tests on: - schedule: - - cron: '0 20 * * *' workflow_dispatch: + workflow_call: jobs: test-nightly-tests: runs-on: k8s-mle-gpu-12-vcpu-225GB-ram-2-a6000-48G @@ -33,6 +32,5 @@ jobs: run: | pytest tests/sparseml/transformers/obcq -m integration - name: Run finetune tests - if: always() run: | pytest tests/sparseml/transformers/finetune -m integration \ No newline at end of file diff --git a/.github/workflows/test-wheel-push-to-internal.yml b/.github/workflows/test-wheel-and-publish.yml similarity index 57% rename from .github/workflows/test-wheel-push-to-internal.yml rename to .github/workflows/test-wheel-and-publish.yml index 28af2f272e7..e40fa462ded 100644 --- a/.github/workflows/test-wheel-push-to-internal.yml +++ b/.github/workflows/test-wheel-and-publish.yml @@ -1,4 +1,4 @@ -name: Test Wheel and Push to Internal PyPi +name: Test Wheel and Publish on: workflow_call: inputs: @@ -11,9 +11,15 @@ on: required: true python: type: string + dev: + type: string + required: true + release: + type: string + required: true jobs: - test-wheel-and-push-internal: + test-wheel-and-publish: runs-on: ${{ inputs.build-label }} steps: - uses: actions/setup-python@v4 @@ -36,24 +42,37 @@ jobs: filename: ${{ inputs.whl }} dst: dist_s3 - - name: Set Env - run: | - pip3 install virtualenv - virtualenv venv - 
source venv/bin/activate - - name: Fetch name of whl run: | echo "FILENAME=$(echo dist_s3/*.whl)" >> $GITHUB_ENV - name: Install whl run: | - pip3 install $FILENAME[dev] + pip3 install $FILENAME[dev,onnxruntime,torch,torchvision,transformers] - name: Checkout code uses: actions/checkout@v3 - name: Remove src files and run tests run: | + pwd rm -rf src - make test \ No newline at end of file + make test + + - name: Make directory for wheel + run: | + mkdir dist_s3 + + - name: Pull from s3 + uses: neuralmagic/nm-actions/actions/s3_pull@main + with: + filename: ${{ inputs.whl }} + dst: dist_s3 + + - name: Publish Nightly Wheel + if: ${{ inputs.DEV == 'false' && inputs.RELEASE == 'false'}} + uses: neuralmagic/nm-actions/actions/publish-whl@main + with: + username: ${{ secrets.PYPI_PUBLIC_USER }} + password: ${{ secrets.PYPI_PUBLIC_AUTH }} + whl: ./$FILENAME \ No newline at end of file From 5c1de1c73577b9a4ca3666662a50ccff2c8acd03 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Thu, 13 Jun 2024 16:04:46 -0400 Subject: [PATCH 05/19] udpate llama7b_sparse_quantized example (#2322) * udpate llama7b_sparse_quantized example * one shot llama example * Update examples/llama7b_sparse_quantized/README.md Co-authored-by: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> * Fix GPTQ Aliases (#2327) * fix alias application with unit tests * style --------- Co-authored-by: Sara Adkins Co-authored-by: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> --- examples/llama7b_one_shot_quantization.md | 50 ++++++++++++ examples/llama7b_sparse_quantized/README.md | 80 +++++++++++++------ .../modifiers/quantization/gptq/base.py | 42 +++++----- .../pruning/sparsegpt/test_pytorch.py | 2 +- .../transformers/gptq/test_oneshot.py | 80 ++++++++++++++----- 5 files changed, 189 insertions(+), 65 deletions(-) create mode 100644 examples/llama7b_one_shot_quantization.md diff --git a/examples/llama7b_one_shot_quantization.md b/examples/llama7b_one_shot_quantization.md new file 
mode 100644 index 00000000000..d3ee50e1aaf --- /dev/null +++ b/examples/llama7b_one_shot_quantization.md @@ -0,0 +1,50 @@ +# Creating a Quantized Llama Model in One Shot + +Quantizing a model to a lower precision can save on both memory and speed at inference time. +This example demonstrates how to use the SparseML API to quantize a Llama model from 16 bits +to 4 bits and save it to a compressed-tensors format for inference with vLLM. + +## Step 1: Select a model and dataset +For this example, we will use a TinyLlama model and the open platypus dataset, however +these can be swapped out for any huggingface compatible models and datasets. + +```python +model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T" +dataset = "open_platypus" +``` + +## Step 2: Configure a `GPTQModifier` +Modifiers in sparseml are used to apply optimizations to models. In this example we use a +`GPTQModifier` to apply the GPTQ algorithm to our model. We target all `Linear` layers +for 4-bit weight quantization. These options may be swapped out for any valid `QuantizationScheme`. + +```python +from sparseml.modifiers.quantization.gptq import GPTQModifier + +gptq = GPTQModifier( + targets="Linear", + scheme="W4A16" +) +``` + + +## Step 3: One-Shot Compression + +The `oneshot` API applies the created modifier to the target model and dataset. +Setting `save_compressed` to True runs the model through `compressed_tensors` compression +after the quantization is completed. 
+ +```python +from sparseml.transformers import oneshot + +oneshot( + model=model, + dataset=dataset, + recipe=gptq, + save_compressed=True, + output_dir="llama-compressed-example", + overwrite_output_dir=True, + max_seq_length=256, + num_calibration_samples=256, +) +``` diff --git a/examples/llama7b_sparse_quantized/README.md b/examples/llama7b_sparse_quantized/README.md index 779696ba599..c96b6e7ca43 100644 --- a/examples/llama7b_sparse_quantized/README.md +++ b/examples/llama7b_sparse_quantized/README.md @@ -1,47 +1,79 @@ # Creating a Sparse Quantized Llama7b Model -The example in this folder runs in multiple stages to create a Llama 7b model with -a 2:4 sparsity pattern and W4A16 post training quantization (PTW). The model is -calibrated and trained with the ultachat200k dataset. At least 75GB of GPU memory is -required to run this example. +This example uses SparseML and Compressed-Tensors to create a 2:4 sparse and quantized Llama2-7b model. +The model is calibrated and trained with the ultrachat200k dataset. +At least 75GB of GPU memory is required to run this example. -## Recipe Summary +Follow the steps below, or run the example directly with `python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py`. -The recipe used for this flow is located in [2:4_w4a16_recipe.yaml](./2:4_w4a16_recipe.yaml). It contains 3 stages that are outlined below. +## Step 1: Select a model, dataset, and recipe +In this step, we select which model to use as a baseline for sparsification, a dataset to +use for calibration and finetuning, and a recipe. +Models can reference a local directory, a model in the huggingface hub, or a model in the sparsezoo. -### Stage 1: Sparsification +Datasets can be from a local compatible directory or the huggingface hub. -Runs the SparseGPT one-shot algorithm to prune the model to 50% sparsity with a 2:4 -sparsity pattern. This means that 2 weights out of every group of 4 weights are masked to 0. 
+Recipes are YAML files that describe how a model should be optimized during or after training. +The recipe used for this flow is located in [2:4_w4a16_recipe.yaml](./2:4_w4a16_recipe.yaml). +It contains instructions to prune the model to 2:4 sparsity, run one epoch of recovery finetuning, +and quantize to 4 bits in one shot using GPTQ. -### Stage 2: Finetuning Recovery - -This stage runs a single epoch of training on the ultrachat200k dataset while maintaining -the sparsity mask from stage 1. The purpose of this stage is to recover any accuracy lost -during the sparsification process. +```python +import torch +from sparseml.transformers import SparseAutoModelForCausalLM -### Stage 3: Quantization +model_stub = "zoo:llama2-7b-ultrachat200k_llama2_pretrain-base" +model = SparseAutoModelForCausalLM.from_pretrained( + model_stub, torch_dtype=torch.bfloat16, device_map="auto" +) -Finally, we run the GPTQ one-shot algorithm to quantize all linear weights to 4 bit -channelwise. +dataset = "ultrachat-200k" +splits = {"calibration": "train_gen[:5%]", "train": "train_gen"} -## How to Run +recipe = "2:4_w4a16_recipe.yaml" +``` -We can run the entire staged recipe with one call to SparseML's `apply` pathway. This -will save a checkpoint of the model after each stage. +## Step 2: Run sparsification using `apply` +The `apply` function applies the given recipe to our model and dataset. +The hardcoded kwargs may be altered based on each model's needs. +After running, the sparsified model will be saved to `output_llama7b_2:4_w4a16_channel`. 
+ +```python +from sparseml.transformers import apply + +output_dir = "output_llama7b_2:4_w4a16_channel" + +apply( + model=model, + dataset=dataset, + recipe=recipe, + bf16=False, # use full precision for training + output_dir=output_dir, + splits=splits, + max_seq_length=512, + num_calibration_samples=512, + num_train_epochs=0.5, + logging_steps=500, + save_steps=5000, + gradient_checkpointing=True, + learning_rate=0.0001, + lr_scheduler_type="cosine", + warmup_ratio=0.1, +) +``` -```python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py``` -### Compression +### Step 3: Compression The resulting model will be uncompressed. To save a final compressed copy of the model run the following: -``` +```python import torch from sparseml.transformers import SparseAutoModelForCausalLM +compressed_output_dir = "output_llama7b_2:4_w4a16_channel_compressed" model = SparseAutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16) model.save_pretrained(compressed_output_dir, save_compressed=True) ``` @@ -49,4 +81,4 @@ model.save_pretrained(compressed_output_dir, save_compressed=True) ### Custom Quantization The current repo supports multiple quantization techniques configured using a recipe. Supported strategies are `tensor`, `group` and `channel`. The above recipe (`2:4_w4a16_recipe.yaml`) uses channel-wise quantization specified by `strategy: "channel"` in its config group. -To use quantize per tensor, change strategy from `channel` to `tensor`. To use group size quantization, change from `channel` to `group` and specify its value, say 128, by including `group_size: 128`. Group size quantization example is shown in `2:4_w4a16_group-128_recipe.yaml` +To use quantize per tensor, change strategy from `channel` to `tensor`. To use group size quantization, change from `channel` to `group` and specify its value, say 128, by including `group_size: 128`. A group size quantization example is shown in `2:4_w4a16_group-128_recipe.yaml`. 
diff --git a/src/sparseml/modifiers/quantization/gptq/base.py b/src/sparseml/modifiers/quantization/gptq/base.py index 004fce2ee7a..43bc596d849 100644 --- a/src/sparseml/modifiers/quantization/gptq/base.py +++ b/src/sparseml/modifiers/quantization/gptq/base.py @@ -18,9 +18,9 @@ from pydantic import Field from compressed_tensors.quantization import ( - QuantizationConfig, QuantizationScheme, is_preset_scheme, + preset_name_to_scheme, ) from sparseml.core import Modifier from sparseml.core.factory import ModifierFactory @@ -77,6 +77,7 @@ class GPTQModifier(Modifier): QuantizationScheme except targets, which will be set to the targets parameter set at the modifier level. Can also be set to a dictionary of the format `preset_scheme_name: targets` for example: `W8A8: ['Linear']` for weight 8 bit + or a string of a preset scheme if targets is provided and activation 8 bit quantization on the Linear layers. """ @@ -89,7 +90,7 @@ class GPTQModifier(Modifier): ignore: List[str] = Field(default_factory=list) disable_quantization_observer_epoch: Optional[float] = None num_calibration_steps: Optional[int] = None - scheme: Optional[Dict[str, Any]] = None + scheme: Optional[Union[str, Dict[str, Any]]] = None compressible_layers_: Optional[List] = None quantization_modifier_: Any = None @@ -167,32 +168,33 @@ def _build_quant_modifier(self, framework): if getattr(self, key, False) } + if isinstance(self.targets, str): + self.targets = [self.targets] + if self.scheme is not None: # takes precedence over config_groups - if any(is_preset_scheme(key) for key in self.scheme.keys()): - config_groups = QuantizationConfig( - config_groups=self.scheme - ).config_groups - quant_args["config_groups"] = config_groups - else: - targets = self.targets or ["Linear"] - config_group = QuantizationScheme.model_validate( - {"targets": targets, **self.scheme} - ) - quant_args["config_groups"] = {"config_group_0": config_group} + if isinstance(self.scheme, str) and is_preset_scheme(self.scheme): + # 
attach targets to scheme + self.scheme = {self.scheme: self.targets} - targets = self.targets or ["Linear"] - config_group = QuantizationScheme.model_validate( - {"targets": targets, **self.scheme} - ) - quant_args["config_groups"] = {"config_group_0": config_group} + quant_args["config_groups"] = {} + for idx, key in enumerate(self.scheme.keys()): + if is_preset_scheme(key): + scheme = preset_name_to_scheme(key, self.scheme[key]) + else: + scheme = QuantizationScheme.model_validate( + {"targets": self.scheme[key], **self.scheme} + ) + + group_name = f"group_{idx}" + quant_args["config_groups"][group_name] = scheme - if "config_groups" not in quant_args: + if "config_groups" not in quant_args or len("config_groups") == 0: default_quant_scheme = QuantizationScheme.default_scheme( targets=self.targets ) - quant_args["config_groups"] = {"config_group_0": default_quant_scheme} + quant_args["config_groups"] = {"group_0": default_quant_scheme} _LOGGER.info(f"Building quantization modifier with args: {quant_args}") vllm_quant_config = {"QuantizationModifier": quant_args} self._build_quant_modifier_from_dict(vllm_quant_config, framework) diff --git a/tests/sparseml/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py b/tests/sparseml/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py index 0fcb66eee9c..1b9f365bebf 100644 --- a/tests/sparseml/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py +++ b/tests/sparseml/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py @@ -95,7 +95,7 @@ def test_create_default_quant_modifier(self): modifier.on_initialize_structure(testing_harness.get_state()) assert modifier.quantize assert isinstance(modifier.quantization_modifier_, QuantizationModifier) - default_config_group_name = "config_group_0" + default_config_group_name = "group_0" should_be_default_quant_scheme = modifier.quantization_modifier_.config_groups[ default_config_group_name ] diff --git a/tests/sparseml/transformers/gptq/test_oneshot.py 
b/tests/sparseml/transformers/gptq/test_oneshot.py index c7c14275df1..1d2e28cc303 100644 --- a/tests/sparseml/transformers/gptq/test_oneshot.py +++ b/tests/sparseml/transformers/gptq/test_oneshot.py @@ -16,11 +16,57 @@ import shutil import unittest +from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme +from parameterized import parameterized_class +from sparseml.modifiers.quantization.gptq import GPTQModifier from sparseml.transformers.sparsification.sparse_model import SparseAutoModelForCausalLM from tests.testing_utils import requires_torch +recipe_str = """ +quant_stage: + quant_modifiers: + GPTQModifier: + sequential_update: false + ignore: ["lm_head"] + config_groups: + group_0: + weights: + num_bits: 4 + type: "int" + symmetric: true + strategy: "channel" + targets: ["Linear"] +""" + +recipe_modifier_full = GPTQModifier( + ignore=["lm_head"], + sequential_update=False, + config_groups={ + "group_0": QuantizationScheme( + targets=["Linear"], weights=QuantizationArgs(num_bits=4, strategy="channel") + ) + }, +) + +recipe_modifier_shorthand_a = GPTQModifier( + ignore=["lm_head"], sequential_update=False, targets="Linear", scheme="W4A16" +) + +recipe_modifier_shorthand_b = GPTQModifier( + ignore=["lm_head"], sequential_update=False, scheme={"W4A16": ["Linear"]} +) + + @requires_torch +@parameterized_class( + [ + {"recipe": recipe_str}, + {"recipe": recipe_modifier_full}, + {"recipe": recipe_modifier_shorthand_a}, + {"recipe": recipe_modifier_shorthand_b}, + ] +) class TestGPTQOneShotWithFullScheme(unittest.TestCase): def setUp(self): import torch @@ -30,26 +76,6 @@ def setUp(self): self.dataset = "open_platypus" self.device = "cuda:0" if torch.cuda.is_available() else "cpu" - self.recipe = """ - first_stage: - quant_modifiers: - GPTQModifier: - ignore: ["lm_head"] - sequential_update: True - dampening_frac: 0.001 - block_size: 128 - targets: ["Linear"] - scheme: - input_activations: null - output_activations: null - weights: - num_bits: 
8 - type: "int" - symmetric: true - strategy: "tensor" - group_size: 128 - """ - def test_oneshot_application(self): from sparseml.transformers import oneshot @@ -68,9 +94,23 @@ def test_oneshot_application(self): # Check that the model is quantized assert model_loaded.quantization_config is not None + # check config is set properly + assert model_loaded.quantization_config.ignore == ["lm_head"] + assert len(model_loaded.quantization_config.config_groups) == 1 + quant_scheme = model_loaded.quantization_config.config_groups["group_0"] + assert isinstance(quant_scheme, QuantizationScheme) + assert quant_scheme.targets == ["Linear"] + weight_args = model_loaded.quantization_config.config_groups["group_0"].weights + assert isinstance(weight_args, QuantizationArgs) + assert weight_args.num_bits == 4 + # Check a specific layer is quantized targetted_linear_layer = model_loaded.transformer.h[0].attn.attention.k_proj assert hasattr(targetted_linear_layer, "quantization_scheme") + # Check lm-head is not quantized + not_targetted = model_loaded.lm_head + assert not hasattr(not_targetted, "quantization_scheme") + def tearDown(self): shutil.rmtree(self.output) From cfe86f4a976cfb9b658ae36de874ba38865ad1d9 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Fri, 14 Jun 2024 14:43:27 -0400 Subject: [PATCH 06/19] limit sparsezoo and deepsparse deps to 1.7 (#2329) --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 5e55ebec249..f64e706c397 100644 --- a/setup.py +++ b/setup.py @@ -56,11 +56,11 @@ "protobuf>=3.12.2,<=3.20.3", "click>=7.1.2,!=8.0.0", # latest version < 8.0 + blocked version with reported bug ] -_nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}~={version_nm_deps}"] +_nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}>={1.7.0}"] _deepsparse_deps = [ - f"{'deepsparse' if is_release else 'deepsparse-nightly'}~={version_nm_deps}" + f"{'deepsparse' if is_release else 
'deepsparse-nightly'}>={1.7.0}" ] -_deepsparse_ent_deps = [f"deepsparse-ent~={version_nm_deps}"] +_deepsparse_ent_deps = [f"deepsparse-ent>={1.7.0}"] _onnxruntime_deps = ["onnxruntime>=1.0.0"] _clip_deps = ["open_clip_torch==2.20.0"] From 4a2fcd9b522449befe5a1941ea4da7027d73d1ed Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Fri, 14 Jun 2024 14:57:18 -0400 Subject: [PATCH 07/19] fix f-string formatting in setup.py update (#2330) --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index f64e706c397..a533821deac 100644 --- a/setup.py +++ b/setup.py @@ -56,11 +56,11 @@ "protobuf>=3.12.2,<=3.20.3", "click>=7.1.2,!=8.0.0", # latest version < 8.0 + blocked version with reported bug ] -_nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}>={1.7.0}"] +_nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}>=1.7.0"] _deepsparse_deps = [ - f"{'deepsparse' if is_release else 'deepsparse-nightly'}>={1.7.0}" + f"{'deepsparse' if is_release else 'deepsparse-nightly'}>=1.7.0" ] -_deepsparse_ent_deps = [f"deepsparse-ent>={1.7.0}"] +_deepsparse_ent_deps = ["deepsparse-ent>=1.7.0"] _onnxruntime_deps = ["onnxruntime>=1.0.0"] _clip_deps = ["open_clip_torch==2.20.0"] From 813033e618808d3165349136bfcba2edd7a6a216 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Mon, 17 Jun 2024 10:29:55 -0400 Subject: [PATCH 08/19] style fix setup.py (#2331) --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index a533821deac..928b13ebc9e 100644 --- a/setup.py +++ b/setup.py @@ -57,9 +57,7 @@ "click>=7.1.2,!=8.0.0", # latest version < 8.0 + blocked version with reported bug ] _nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}>=1.7.0"] -_deepsparse_deps = [ - f"{'deepsparse' if is_release else 'deepsparse-nightly'}>=1.7.0" -] +_deepsparse_deps = [f"{'deepsparse' if is_release else 'deepsparse-nightly'}>=1.7.0"] _deepsparse_ent_deps = 
["deepsparse-ent>=1.7.0"] _onnxruntime_deps = ["onnxruntime>=1.0.0"] From 43995039db04b7e6511226fe701f0b31e0cf33c6 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 17 Jun 2024 10:35:36 -0400 Subject: [PATCH 09/19] Update Test Recipes for Latest Modifier Changes (#2326) * update recipes for new modifier * fixes --------- Co-authored-by: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> --- .../transformers/obcq/recipes/quant.yaml | 27 +------------- .../obcq/recipes/quant_and_sparse.yaml | 37 ++++--------------- .../transformers/obcq/recipes/sparse.yaml | 5 +-- .../transformers/obcq/test_obcq_sparsity.py | 4 -- 4 files changed, 10 insertions(+), 63 deletions(-) diff --git a/tests/sparseml/transformers/obcq/recipes/quant.yaml b/tests/sparseml/transformers/obcq/recipes/quant.yaml index f5436b3873f..9c5a6ac6209 100644 --- a/tests/sparseml/transformers/obcq/recipes/quant.yaml +++ b/tests/sparseml/transformers/obcq/recipes/quant.yaml @@ -6,32 +6,7 @@ test_stage: [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"], [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"] ] - LegacyQuantizationModifier: - ignore: - - LlamaRotaryEmbedding - - LlamaRMSNorm - - SiLU - - model.layers.0.mlp.down_proj - - model.layers.1.mlp.down_proj - - model.layers.2.mlp.down_proj - - model.layers.3.mlp.down_proj - - model.layers.4.mlp.down_proj - - model.layers.5.mlp.down_proj - scheme_overrides: - Embedding: - input_activations: null - weights: - num_bits: 8 - symmetric: False GPTQModifier: block_size: 128 sequential_update: False - percdamp: 0.01 - targets: [ - "model.layers.0", - "model.layers.1", - "model.layers.2", - "model.layers.3", - "model.layers.4", - "model.layers.5" - ] \ No newline at end of file + percdamp: 0.01 \ No newline at end of file diff --git a/tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml b/tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml index 198b32f0e3c..643ba175597 100644 --- 
a/tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml +++ b/tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml @@ -1,5 +1,11 @@ test_stage: obcq_modifiers: + SparseGPTModifier: + sparsity: 0.5 + block_size: 128 + sequential_update: False + percdamp: 0.01 + mask_structure: "0:0" SmoothQuantModifier: smoothing_strength: 0.5 mappings: [ @@ -11,13 +17,6 @@ test_stage: - LlamaRotaryEmbedding - LlamaRMSNorm - SiLU - - model.layers.0.mlp.down_proj - - model.layers.1.mlp.down_proj - - model.layers.2.mlp.down_proj - - model.layers.3.mlp.down_proj - - model.layers.4.mlp.down_proj - - model.layers.5.mlp.down_proj - post_oneshot_calibration: True scheme_overrides: Embedding: input_activations: null @@ -27,26 +26,4 @@ test_stage: GPTQModifier: block_size: 128 sequential_update: False - percdamp: 0.01 - targets: [ - "model.layers.0", - "model.layers.1", - "model.layers.2", - "model.layers.3", - "model.layers.4", - "model.layers.5" - ] - SparseGPTModifier: - sparsity: 0.5 - block_size: 128 - sequential_update: False - percdamp: 0.01 - mask_structure: "0:0" - targets: [ - "model.layers.0", - "model.layers.1", - "model.layers.2", - "model.layers.3", - "model.layers.4", - "model.layers.5" - ] \ No newline at end of file + percdamp: 0.01 \ No newline at end of file diff --git a/tests/sparseml/transformers/obcq/recipes/sparse.yaml b/tests/sparseml/transformers/obcq/recipes/sparse.yaml index 70ffc7bf784..4309a066377 100644 --- a/tests/sparseml/transformers/obcq/recipes/sparse.yaml +++ b/tests/sparseml/transformers/obcq/recipes/sparse.yaml @@ -5,9 +5,8 @@ test_stage: block_size: 128 sequential_update: False percdamp: 0.01 - mask_structure: "0:0" targets: [ "model.layers.0", "model.layers.1", - "lm_head" - ] \ No newline at end of file + ] + mask_structure: "0:0" \ No newline at end of file diff --git a/tests/sparseml/transformers/obcq/test_obcq_sparsity.py b/tests/sparseml/transformers/obcq/test_obcq_sparsity.py index c6c80b301aa..d8e25271c9b 100644 --- 
a/tests/sparseml/transformers/obcq/test_obcq_sparsity.py +++ b/tests/sparseml/transformers/obcq/test_obcq_sparsity.py @@ -60,8 +60,6 @@ def test_sparsities(self): model = get_session_model() - lm_head_sparsity = tensor_sparsity(model.lm_head.weight) - assert math.isclose(lm_head_sparsity.item(), self.sparsity, rel_tol=1e-4) layer_1_sparse = tensor_sparsity(model.model.layers[1].self_attn.k_proj.weight) assert math.isclose(layer_1_sparse.item(), self.sparsity, rel_tol=1e-4) layer_2_dense = tensor_sparsity(model.model.layers[2].self_attn.k_proj.weight) @@ -118,8 +116,6 @@ def test_sparsities_gpu(self): model = get_session_model() - lm_head_sparsity = tensor_sparsity(model.lm_head.weight) - assert math.isclose(lm_head_sparsity.item(), self.sparsity, rel_tol=1e-4) layer_1_sparse = tensor_sparsity(model.model.layers[1].self_attn.k_proj.weight) assert math.isclose(layer_1_sparse.item(), self.sparsity, rel_tol=1e-4) layer_2_dense = tensor_sparsity(model.model.layers[2].self_attn.k_proj.weight) From 50b685854dd4b78ac3c647b13a9fbc7009b5fa2e Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Mon, 17 Jun 2024 14:37:11 -0400 Subject: [PATCH 10/19] restrict numpy <2.0 (#2333) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 928b13ebc9e..7eaaf76ea1f 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ _deps = [ "pyyaml>=5.0.0", - "numpy>=1.17.0", + "numpy>=1.17.0,<2.0", "matplotlib>=3.0.0", "merge-args>=0.1.0", "onnx>=1.5.0,<1.15.0", From bd0d80cc21cd39cf0ba43f785d31a8e0d3aa1674 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 17 Jun 2024 15:16:22 -0400 Subject: [PATCH 11/19] fix wheel build (#2332) * fix wheel build * comment --- setup.py | 19 +++++++++++------ src/sparseml/version.py | 14 ++++++++----- utils/artifacts.py | 46 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 11 deletions(-) create mode 100644 utils/artifacts.py diff --git a/setup.py b/setup.py index 7eaaf76ea1f..413e533aa4c 
100644 --- a/setup.py +++ b/setup.py @@ -16,18 +16,25 @@ from typing import Dict, List, Tuple from setuptools import find_packages, setup +from utils.artifacts import get_release_and_version -# default variables to be overwritten by the version.py file -is_release = None -is_dev = None -version = "unknown" -version_major_minor = version +package_path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), "src", "sparseml" +) +( + is_release, + is_dev, + version, + version_major, + version_minor, + version_bug, +) = get_release_and_version(package_path) # load and overwrite version and release info from sparseml package exec(open(os.path.join("src", "sparseml", "version.py")).read()) print(f"loaded version {version} from src/sparseml/version.py") -version_nm_deps = f"{version_major_minor}.0" +version_nm_deps = f"{version_major}.{version_minor}.0" if is_release: _PACKAGE_NAME = "sparseml" diff --git a/src/sparseml/version.py b/src/sparseml/version.py index ffe77da583a..1279a498a4b 100644 --- a/src/sparseml/version.py +++ b/src/sparseml/version.py @@ -16,16 +16,20 @@ Functionality for storing and setting the version info for SparseML """ -from datetime import date - - version_base = "1.8.0" is_release = False # change to True to set the generated version as a release version is_dev = False dev_number = None -def _generate_version(): +def _generate_version( + is_release: bool, + is_dev: bool, + version_base: str, + dev_number: str, +): + from datetime import date + if is_release: return version_base elif is_dev: @@ -45,7 +49,7 @@ def _generate_version(): "version_build", "version_major_minor", ] -__version__ = _generate_version() +__version__ = _generate_version(is_release, is_dev, version_base, dev_number) version = __version__ version_major, version_minor, version_bug, version_build = version.split(".") + ( diff --git a/utils/artifacts.py b/utils/artifacts.py new file mode 100644 index 00000000000..a93bda61122 --- /dev/null +++ b/utils/artifacts.py @@ -0,0 
+1,46 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +from typing import Tuple + + +def get_release_and_version(package_path: str) -> Tuple[bool, bool, str, str, str, str]: + """ + Load version and release info from the sparseml package + """ + # sparseml/src/sparseml/version.py always exists, default source of truth + version_path = os.path.join(package_path, "version.py") + + # exec() cannot set local variables, so capture them in a dict manually + locals_dict = {} + exec(open(version_path).read(), globals(), locals_dict) + is_release = locals_dict.get("is_release", False) + is_dev = locals_dict.get("is_dev", False) + version = locals_dict.get("version", "unknown") + version_major = locals_dict.get("version_major", "unknown") + version_minor = locals_dict.get("version_minor", "unknown") + version_bug = locals_dict.get("version_bug", "unknown") + + print(f"Loaded version {version} from {version_path}") + + return ( + is_release, + is_dev, + version, + version_major, + version_minor, + version_bug, + ) From e4ba0279411b1c6e3fb73d4015f87ba8fca6baf1 Mon Sep 17 00:00:00 2001 From: dhuangnm <74931910+dhuangnm@users.noreply.github.com> Date: Tue, 18 Jun 2024 09:09:48 -0400 Subject: [PATCH 12/19] use nm-actions for release build (#2335) Co-authored-by: dhuangnm --- .github/workflows/build-release-wheel.yaml | 57 ++++++++++++++++++++++ .github/workflows/build-release.yml | 19 -------- 2 files
changed, 57 insertions(+), 19 deletions(-) create mode 100644 .github/workflows/build-release-wheel.yaml delete mode 100644 .github/workflows/build-release.yml diff --git a/.github/workflows/build-release-wheel.yaml b/.github/workflows/build-release-wheel.yaml new file mode 100644 index 00000000000..a5f5a250b83 --- /dev/null +++ b/.github/workflows/build-release-wheel.yaml @@ -0,0 +1,57 @@ +name: Build release wheel + +on: + push: + branches: + - 'release/*' + + workflow_dispatch: + inputs: + gitref: + description: "git tag, commit or branch name for the release" + type: string + required: true + default: 'release/1.8 + +jobs: + build-release-wheel: + runs-on: ubuntu-20.04 + steps: + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.gitref }} + + - name: Build PyPi Wheel + id: build + uses: neuralmagic/nm-actions/actions/pypi_build@main + with: + dev: false + release: true + + - name: Set Env + run: | + pip3 install --upgrade pip && pip3 install --upgrade setuptools + pip3 install virtualenv + virtualenv venv + source venv/bin/activate + + - name: upload whl + uses: actions/upload-artifact@v4 + if: success() || failure() + with: + name: "wheel-sparseml" + path: ${{ steps.build.outputs.whlname }} + retention-days: 7 + + - name: upload tar.gz + uses: actions/upload-artifact@v4 + if: success() || failure() + with: + name: "tar-sparseml" + path: ${{ steps.build.outputs.tarname }} + retention-days: 7 diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml deleted file mode 100644 index 26d9eb31946..00000000000 --- a/.github/workflows/build-release.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: build-release -run-name: ${{ github.workflow }} is to create release wheel file for pypi -on: - push: - branches: - - 'release/[0-9]+.[0-9]+' - workflow_dispatch: - -jobs: - - BUILD-SPARSEML-RELEASE: - - uses: ./.github/workflows/util.yml - with: - runs_on: 
ubuntu-22.04 - run_id: ${{ github.run_id }} - build_type: release - testmo_project_id: 9 - secrets: inherit From ffa38520af22a20d0b05ede55cae67bc04d18550 Mon Sep 17 00:00:00 2001 From: dhuangnm <74931910+dhuangnm@users.noreply.github.com> Date: Tue, 18 Jun 2024 09:14:13 -0400 Subject: [PATCH 13/19] missing a ' --- .github/workflows/build-release-wheel.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-release-wheel.yaml b/.github/workflows/build-release-wheel.yaml index a5f5a250b83..20457401a6a 100644 --- a/.github/workflows/build-release-wheel.yaml +++ b/.github/workflows/build-release-wheel.yaml @@ -11,7 +11,7 @@ on: description: "git tag, commit or branch name for the release" type: string required: true - default: 'release/1.8 + default: 'release/1.8' jobs: build-release-wheel: From 22cc6be232b6b7b4417d75b1fb7d6af467563e62 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Tue, 18 Jun 2024 10:49:55 -0400 Subject: [PATCH 14/19] Sparse Quantization Example Clarification (#2334) * clarify example * cleanup * update examples * update output name --- examples/llama7b_sparse_quantized/README.md | 8 +++++--- examples/llama7b_w8a8_quantization.py | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/llama7b_sparse_quantized/README.md b/examples/llama7b_sparse_quantized/README.md index c96b6e7ca43..f10bb0984ab 100644 --- a/examples/llama7b_sparse_quantized/README.md +++ b/examples/llama7b_sparse_quantized/README.md @@ -4,7 +4,8 @@ This example uses SparseML and Compressed-Tensors to create a 2:4 sparse and qua The model is calibrated and trained with the ultachat200k dataset. At least 75GB of GPU memory is required to run this example. 
-Follow the steps below, or to run the example as `python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py` +Follow the steps below one by one in a code notebook, or run the full example script +as `python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py` ## Step 1: Select a model, dataset, and recipe In this step, we select which model to use as a baseline for sparsification, a dataset to @@ -36,7 +37,8 @@ recipe = "2:4_w4a16_recipe.yaml" ## Step 2: Run sparsification using `apply` The `apply` function applies the given recipe to our model and dataset. -The hardcoded kwargs may be altered based on each model's needs. +The hardcoded kwargs may be altered based on each model's needs. This code snippet should +be run in the same Python instance as step 1. After running, the sparsified model will be saved to `output_llama7b_2:4_w4a16_channel`. ```python @@ -67,7 +69,7 @@ apply( ### Step 3: Compression The resulting model will be uncompressed. To save a final compressed copy of the model -run the following: +run the following in the same Python instance as the previous steps. 
```python import torch diff --git a/examples/llama7b_w8a8_quantization.py b/examples/llama7b_w8a8_quantization.py index c894613ffbb..702218f7db7 100644 --- a/examples/llama7b_w8a8_quantization.py +++ b/examples/llama7b_w8a8_quantization.py @@ -16,12 +16,12 @@ num_bits: 8 type: "int" symmetric: true - strategy: "channel" + strategy: "tensor" input_activations: num_bits: 8 type: "int" symmetric: true - dynamic: True + dynamic: true strategy: "token" targets: ["Linear"] """ @@ -37,7 +37,7 @@ dataset = "ultrachat-200k" # save location of quantized model out -output_dir = "./output_llama7b_w8a8_channel_dynamic_compressed" +output_dir = "./output_llama7b_w8a8_dynamic_compressed" # set dataset config parameters splits = {"calibration": "train_gen[:5%]"} From 9a28cd76d0bf816bfae51ccaa1f9862aff4c40af Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Tue, 18 Jun 2024 16:48:26 -0400 Subject: [PATCH 15/19] update ignore list in llama examples (#2338) --- examples/llama7b_one_shot_quantization.md | 3 ++- .../llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llama7b_one_shot_quantization.md b/examples/llama7b_one_shot_quantization.md index d3ee50e1aaf..af644897065 100644 --- a/examples/llama7b_one_shot_quantization.md +++ b/examples/llama7b_one_shot_quantization.md @@ -23,7 +23,8 @@ from sparseml.modifiers.quantization.gptq import GPTQModifier gptq = GPTQModifier( targets="Linear", - scheme="W4A16" + scheme="W4A16", + ignore=["lm_head"], ) ``` diff --git a/examples/llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml b/examples/llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml index aeddebb8cb3..6f35f511396 100644 --- a/examples/llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml +++ b/examples/llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml @@ -23,6 +23,7 @@ quantization_stage: run_type: oneshot quantization_modifiers: GPTQModifier: + ignore: [ "lm_head" ] 
sequential_update: false config_groups: group_0: From 52245b3faded4c31bf5d3f24526beb11d33355ae Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Wed, 19 Jun 2024 16:02:58 -0400 Subject: [PATCH 16/19] suppress pydantic warning in legacy quantization modifier (#2340) --- src/sparseml/modifiers/quantization_legacy/base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sparseml/modifiers/quantization_legacy/base.py b/src/sparseml/modifiers/quantization_legacy/base.py index 9b9f1569f09..27856774c00 100644 --- a/src/sparseml/modifiers/quantization_legacy/base.py +++ b/src/sparseml/modifiers/quantization_legacy/base.py @@ -14,6 +14,8 @@ from typing import Any, Dict, List, Optional +from pydantic import ConfigDict + from sparseml.core import Event, Modifier @@ -81,6 +83,8 @@ class LegacyQuantizationModifier(Modifier): post_oneshot_calibration: Optional[bool] = False strict: bool = True + model_config = ConfigDict(protected_namespaces=()) + def __init__(self, **kwargs): super().__init__(**kwargs) if self.model_fuse_fn_kwargs is None: From c4ec6319c17c728c55fe0dec12ee1d17bd72c325 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Thu, 20 Jun 2024 10:44:36 -0400 Subject: [PATCH 17/19] fix uncompressed path (#2339) --- examples/llama7b_sparse_quantized/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llama7b_sparse_quantized/README.md b/examples/llama7b_sparse_quantized/README.md index f10bb0984ab..1a48c01afc6 100644 --- a/examples/llama7b_sparse_quantized/README.md +++ b/examples/llama7b_sparse_quantized/README.md @@ -73,10 +73,12 @@ run the following in the same Python instance as the previous steps. 
```python import torch +import os from sparseml.transformers import SparseAutoModelForCausalLM compressed_output_dir = "output_llama7b_2:4_w4a16_channel_compressed" -model = SparseAutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16) +uncompressed_path = os.path.join(output_dir, "stage_quantization") +model = SparseAutoModelForCausalLM.from_pretrained(uncompressed_path, torch_dtype=torch.bfloat16) model.save_pretrained(compressed_output_dir, save_compressed=True) ``` From 0becc723db5a69e2455330b194f459e6be3a5e90 Mon Sep 17 00:00:00 2001 From: Sara Adkins Date: Thu, 20 Jun 2024 12:55:03 -0400 Subject: [PATCH 18/19] update README memory requirements (#2342) --- examples/llama7b_sparse_quantized/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama7b_sparse_quantized/README.md b/examples/llama7b_sparse_quantized/README.md index 1a48c01afc6..35183345d9c 100644 --- a/examples/llama7b_sparse_quantized/README.md +++ b/examples/llama7b_sparse_quantized/README.md @@ -2,7 +2,7 @@ This example uses SparseML and Compressed-Tensors to create a 2:4 sparse and quantized Llama2-7b model. The model is calibrated and trained with the ultachat200k dataset. -At least 75GB of GPU memory is required to run this example. +At least 85GB of GPU memory is required to run this example. 
Follow the steps below one by one in a code notebook, or run the full example script as `python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py` From f0a369203c4f310f013c52952f533f8a89788464 Mon Sep 17 00:00:00 2001 From: Benjamin Fineran Date: Thu, 20 Jun 2024 15:51:11 -0400 Subject: [PATCH 19/19] suppress pydantic warnings for model_ fields (#2344) --- src/sparseml/exporters/transforms/kv_cache/configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sparseml/exporters/transforms/kv_cache/configs.py b/src/sparseml/exporters/transforms/kv_cache/configs.py index 686adf5c7d5..d0135a2c133 100644 --- a/src/sparseml/exporters/transforms/kv_cache/configs.py +++ b/src/sparseml/exporters/transforms/kv_cache/configs.py @@ -84,7 +84,7 @@ class KeyValueCacheConfig(BaseModel): "the kv cache. If this is not provided, no transpose will " "be applied.", ) - model_config = ConfigDict(arbitrary_types_allowed=True) + model_config = ConfigDict(arbitrary_types_allowed=True, protected_namespaces=()) OPT_CONFIG = KeyValueCacheConfig(