From 3cd9a8ce512da119a8cd47a50a75a4dee33d83d2 Mon Sep 17 00:00:00 2001
From: Benjamin Fineran <bfineran@users.noreply.github.com>
Date: Fri, 7 Jun 2024 12:45:31 -0400
Subject: [PATCH 01/19] fix save path in llama7b_w4a16_quantization.ipynb
 (#2321)

---
 examples/llama7b_w4a16_quantization.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llama7b_w4a16_quantization.ipynb b/examples/llama7b_w4a16_quantization.ipynb
index 194215891fa..4ee88ff0b05 100644
--- a/examples/llama7b_w4a16_quantization.ipynb
+++ b/examples/llama7b_w4a16_quantization.ipynb
@@ -153,7 +153,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model.save_pretrained(\"/network/sadkins/llama1.1b_W4A16_channel_packed\", save_compressed=True)"
+    "model.save_pretrained(\"llama1.1b_W4A16_channel_packed\", save_compressed=True)"
    ]
   }
  ],

From 934f0d8b9b12845fa9b82fed87d4b54cdfec7a3d Mon Sep 17 00:00:00 2001
From: Sara Adkins <sara@neuralmagic.com>
Date: Mon, 10 Jun 2024 11:39:47 -0400
Subject: [PATCH 02/19] Update Quantization Logging to New Framework (#2313)

* use new quant framework for logging

* fix legacy compatibility

* fix
---
 src/sparseml/pytorch/utils/helpers.py         | 31 ++++++-------------
 .../transformers/finetune/session_mixin.py    | 11 +++++--
 2 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/src/sparseml/pytorch/utils/helpers.py b/src/sparseml/pytorch/utils/helpers.py
index 4b495afe497..e9c603355de 100644
--- a/src/sparseml/pytorch/utils/helpers.py
+++ b/src/sparseml/pytorch/utils/helpers.py
@@ -20,7 +20,6 @@
 import os
 import random
 import re
-import warnings
 from collections import OrderedDict, namedtuple
 from contextlib import contextmanager
 from copy import deepcopy
@@ -30,7 +29,7 @@
 import torch
 from packaging import version
 from torch import Tensor
-from torch.nn import Linear, Module, Parameter
+from torch.nn import Embedding, Linear, Module, Parameter
 from torch.nn.modules.conv import Conv2d, Conv3d, _ConvNd
 from torch.optim.optimizer import Optimizer
 from torch.utils.data import DataLoader
@@ -780,6 +779,7 @@ def get_prunable_layers(module: Module) -> List[Tuple[str, Module]]:
         for (name, mod) in module.named_modules()
         if (
             isinstance(mod, Linear)
+            or isinstance(mod, Embedding)
             or isinstance(mod, _ConvNd)
             or (QATLinear and isinstance(mod, QATLinear))
             or (QATConv2d and isinstance(mod, QATConv2d))
@@ -793,7 +793,7 @@ def get_quantizable_layers(module: Module) -> List[Tuple[str, Module]]:
     """
     :param module: the module to get the quantizable layers from
     :return: a list containing the names and modules of the quantizable layers
-        (Linear, Conv2d, Conv3d)
+        (Embedding, Linear, Conv2d, Conv3d)
     """
     if QATLinear is None:
         raise ImportError(
@@ -806,6 +806,7 @@ def get_quantizable_layers(module: Module) -> List[Tuple[str, Module]]:
         for (name, mod) in module.named_modules()
         if (
             isinstance(mod, Linear)
+            or isinstance(mod, Embedding)
             or isinstance(mod, Conv2d)
             or (QATConv3d and isinstance(mod, Conv3d))
         )
@@ -816,29 +817,15 @@ def get_quantized_layers(module: Module) -> List[Tuple[str, Module]]:
     """
     :param module: the module to get the quantized layers from
     :return: a list containing the names and modules of the quantized layers
-        (Linear, Conv2d, Conv3d)
+        (Embedding, Linear, Conv2d, Conv3d)
     """
-    if QATLinear is None:
-        raise ImportError(
-            "PyTorch version is not setup for Quantization. "
-            "Please install a QAT compatible version of PyTorch"
-        )
 
     quantized_layers = []
     for (name, mod) in module.named_modules():
-        if (
-            (QATLinear and isinstance(mod, QATLinear))
-            or (QATConv2d and isinstance(mod, QATConv2d))
-            or (QATConv3d and isinstance(mod, QATConv3d))
-        ):
-            quantized_layers.append((name, mod))
-
-        elif isinstance(mod, Conv3d) and not QATConv3d:
-            warnings.warn(
-                "Pytorch version is not setup for Conv3D Quantization. "
-                "Quantization of Conv3D layers will be skipped",
-                UserWarning,
-            )
+        if hasattr(mod, "quantization_scheme"):
+            weight_scheme = getattr(mod.quantization_scheme, "weights", None)
+            if weight_scheme is not None and hasattr(mod, "weight"):
+                quantized_layers.append((name, mod))
 
     return quantized_layers
 
diff --git a/src/sparseml/transformers/finetune/session_mixin.py b/src/sparseml/transformers/finetune/session_mixin.py
index 149b59be7cd..7436261980e 100644
--- a/src/sparseml/transformers/finetune/session_mixin.py
+++ b/src/sparseml/transformers/finetune/session_mixin.py
@@ -500,15 +500,22 @@ def log_model_sparsification(self):
             f"Sparsification info for {type(self.model).__name__}: "
             f"{sparsification_info.params_total} total params. "
         )
+        sparsity_percent_formatted = "{:.2f}".format(
+            sparsification_info.params_prunable_sparse_percent
+        )
         _LOGGER.info(
             f"There are {sparsification_info.params_prunable_total} prunable "
-            f"params which have {sparsification_info.params_prunable_sparse_percent} "
+            f"params which have {sparsity_percent_formatted}% "
             "avg sparsity."
         )
+
+        quant_percent_formatted = "{:.2f}".format(
+            sparsification_info.params_quantized_percent
+        )
         _LOGGER.info(
             f"There are {sparsification_info.params_quantizable} quantizable "
             f"params, with a quantization percentage of "
-            f"{sparsification_info.params_quantized_percent}."
+            f"{quant_percent_formatted}%."
         )
 
     def _prepare_model_for_fsdp(self):

From e255b17765add46053a2669086cbc95b3fff406c Mon Sep 17 00:00:00 2001
From: Sara Adkins <sara@neuralmagic.com>
Date: Tue, 11 Jun 2024 15:04:28 -0400
Subject: [PATCH 03/19] Fix for Sparsity Persist (#2323)

* fix sparsity persist

* helper moved to compressed-tensors
---
 .../quantization/gptq/utils/gptq_wrapper.py   | 43 +++++++++----------
 .../obcq/test_mask_structure_preservation.py  | 24 +----------
 2 files changed, 21 insertions(+), 46 deletions(-)

diff --git a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
index 73321c0d0aa..ded28b4123b 100644
--- a/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
+++ b/src/sparseml/modifiers/quantization/gptq/utils/gptq_wrapper.py
@@ -103,6 +103,14 @@ def fasterprune(
             W = W.t()
         W = W.float()
 
+        sparsity = tensor_sparsity(W)
+        preserve_zeros = sparsity >= SPARSITY_THRESHOLD
+        W_nz_mask = (
+            (~torch.isclose(W, torch.zeros(1, device=W.device).float())).float()
+            if preserve_zeros
+            else None
+        )
+
         tick = time.time()
 
         dead = torch.diag(self.H) == 0
@@ -119,17 +127,6 @@ def fasterprune(
         self.H = torch.linalg.cholesky(self.H, upper=True)
         Hinv = self.H
 
-        sparsity = tensor_sparsity(W)
-        mask = (
-            torch.where(
-                W == 0,
-                torch.tensor(1, dtype=torch.bool),
-                torch.tensor(0, dtype=torch.bool),
-            )
-            if sparsity >= SPARSITY_THRESHOLD
-            else None
-        )
-
         # See section 3.4 of https://arxiv.org/abs/2203.07259
         for i1 in range(0, self.columns, blocksize):
             i2 = min(i1 + blocksize, self.columns)
@@ -141,21 +138,13 @@ def fasterprune(
             Losses1 = torch.zeros_like(W1)
             Hinv1 = Hinv[i1:i2, i1:i2]
 
-            if sparsity >= SPARSITY_THRESHOLD:
-                tmp = (
-                    (~mask[:, i1:i2])
-                    * W1**2
-                    / (torch.diag(Hinv1).reshape((1, -1))) ** 2
-                )
-                thresh = torch.sort(tmp.flatten())[0][int(tmp.numel() * sparsity)]
-                mask1 = tmp <= thresh
+            if preserve_zeros:
+                W1_nz_mask = W_nz_mask[:, i1:i2]
 
             for i in range(count):
                 w = W1[:, i]
                 d = Hinv1[i, i]
                 q = w.clone()
-                if sparsity >= SPARSITY_THRESHOLD:
-                    q[mask1[:, i]] = 0
 
                 if hasattr(self.layer, "weight_fake_quant"):
                     scale = self.layer.weight_fake_quant.scale
@@ -216,13 +205,21 @@ def fasterprune(
                 Losses1[:, i] = (w - q) ** 2 / d**2
 
                 err1 = (w - q) / d
-                W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
+                w1_err = err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0))
+                if preserve_zeros:
+                    W1[:, i:] -= w1_err * W1_nz_mask[:, i:]
+                else:
+                    W1[:, i:] -= w1_err
                 Err1[:, i] = err1
 
             W[:, i1:i2] = Q1
             Losses += torch.sum(Losses1, 1) / 2
 
-            W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:])
+            w_err = Err1.matmul(Hinv[i1:i2, i2:])
+            if preserve_zeros:
+                W[:, i2:] -= w_err * W_nz_mask[:, i2:]
+            else:
+                W[:, i2:] -= w_err
 
         _LOGGER.info("time %.2f" % (time.time() - tick))
         _LOGGER.info("error %.2f" % torch.sum(Losses).item())
diff --git a/tests/sparseml/transformers/obcq/test_mask_structure_preservation.py b/tests/sparseml/transformers/obcq/test_mask_structure_preservation.py
index a068c391431..eca6f5d2379 100644
--- a/tests/sparseml/transformers/obcq/test_mask_structure_preservation.py
+++ b/tests/sparseml/transformers/obcq/test_mask_structure_preservation.py
@@ -19,6 +19,7 @@
 import pytest
 
 import sparseml
+from compressed_tensors.compressors.utils import tensor_follows_mask_structure
 from parameterized import parameterized_class
 from tests.testing_utils import parse_params, requires_torch
 
@@ -28,29 +29,6 @@
 )
 
 
-def tensor_follows_mask_structure(tensor, mask: str = "2:4"):
-    """
-    :param tensor: tensor to check
-    :param mask: mask structure to check for, in the format "n:m"
-    :return: True if the tensor follows the mask structure, False otherwise.
-        Note, some weights can incidentally be zero, so we check for
-        atleast n zeros in each chunk of size m
-    """
-    import torch
-
-    n, m = tuple(map(int, mask.split(":")))
-    # Reshape the tensor into chunks of size m
-    tensor = tensor.view(-1, m)
-
-    # Count the number of zeros in each chunk
-    zero_counts = (tensor == 0).sum(dim=1)
-
-    # Check if the number of zeros in each chunk atleast n
-    # Greater than sign is needed as some weights can incidentally
-    # be zero
-    return torch.all(zero_counts >= n)
-
-
 @requires_torch
 @pytest.mark.integration
 @parameterized_class(parse_params(MASK_STRUCTURE_CONFIGS_DIRECTORY))

From 4e2ad0ac56ab3569aa350e21bed2f13da11b3408 Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Wed, 12 Jun 2024 12:01:53 -0400
Subject: [PATCH 04/19] [GHA] Update End-to-End Nightly Build Process (#2304)

* trigger nightly workflow

* update condition

* update

* update condition

* skip actual tests to speed up testing

* try true conditions

* try again

* try again

* clean-up

* update conditions

* try again

* try again

* try fil case

* update

* try new condition

* try again

* try again

* try again

* revert

* try new conditions

* typo

* try again

* try dev workflow

* try again

* update condition

* update

* try again

* test failure case

* update

* try again

* update

* try nightly

* add publish

---------

Co-authored-by: Sara Adkins <sara@neuralmagic.com>
---
 .github/workflows/build-container.yml         | 10 ++-
 .github/workflows/build-nightly.yml           | 22 ------
 .../workflows/build-wheel-and-container.yml   | 39 ++++-----
 .../publish-nightly-docker-images.yaml        | 79 -------------------
 .github/workflows/test-nightly.yml            |  4 +-
 ...nternal.yml => test-wheel-and-publish.yml} | 39 ++++++---
 6 files changed, 59 insertions(+), 134 deletions(-)
 delete mode 100644 .github/workflows/build-nightly.yml
 delete mode 100644 .github/workflows/publish-nightly-docker-images.yaml
 rename .github/workflows/{test-wheel-push-to-internal.yml => test-wheel-and-publish.yml} (57%)

diff --git a/.github/workflows/build-container.yml b/.github/workflows/build-container.yml
index 9eda86ae0d0..ae7cc43bc52 100644
--- a/.github/workflows/build-container.yml
+++ b/.github/workflows/build-container.yml
@@ -53,4 +53,12 @@ jobs:
           build-args: |
             BRANCH=${{github.head_ref}}
           push: true
-          tags: ghcr.io/neuralmagic/sparseml-dev:${{ inputs.name }}
\ No newline at end of file
+          tags: ghcr.io/neuralmagic/sparseml-dev:${{ inputs.name }}
+      
+      - name: Build Nightly Docker Container
+        if: ${{ inputs.dev == 'false' && inputs.release == 'false'}}
+        uses: docker/build-push-action@v4
+        with:
+          context: ./docker/containers/docker_nightly
+          push: true
+          tags: ghcr.io/neuralmagic/sparseml-nightly:latest, ghcr.io/neuralmagic/sparseml-nightly:${{ steps.date.outputs.date }}
\ No newline at end of file
diff --git a/.github/workflows/build-nightly.yml b/.github/workflows/build-nightly.yml
deleted file mode 100644
index be44d8b863e..00000000000
--- a/.github/workflows/build-nightly.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-name: build-nightly
-run-name: ${{ github.workflow }} is to create nightly wheel file for pypi
-on:
-  push:
-    branches:
-      - 'main'
-  schedule:
-    - cron: '30 0 * * *'
-  workflow_dispatch:
-
-
-jobs:
-
-    BUILD-SPARSEML-NIGHTLY:
-
-      uses: ./.github/workflows/util.yml
-      with:
-        runs_on: ubuntu-22.04
-        run_id: ${{ github.run_id }}
-        build_type: nightly
-        testmo_project_id: 9
-      secrets: inherit
diff --git a/.github/workflows/build-wheel-and-container.yml b/.github/workflows/build-wheel-and-container.yml
index 3eaaf674e08..421e227577a 100644
--- a/.github/workflows/build-wheel-and-container.yml
+++ b/.github/workflows/build-wheel-and-container.yml
@@ -4,15 +4,8 @@ on:
     types: [opened, synchronize, reopened]
     branches:
       - main
-      - 'release/[0-9]+.[0-9]+'
-  push:
-    branches:
-      - 'release/[0-9]+.[0-9]+'
-      - main
-  release:
-    types: [created, published]
   schedule:
-    - cron: '0 0 * * *'
+    - cron: '0 20 * * *'
 
 permissions:
   id-token: write
@@ -23,10 +16,10 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 
-# if not dev or release, will create a nightly build
+# TODO: do we want to push to nightly everytime we push to main?
+# if not dev or release, will create a nightly build; turning off release for now
 env:
-  PRODUCTION: ${{ github.event_name == 'schedule' || github.event_name == 'release'}}
-  RELEASE: ${{ github.event_name =='release' || startsWith(github.base_ref, 'release/') }}
+  RELEASE: 'false'
   DEV: ${{ github.base_ref == 'main' && github.event_name == 'pull_request'}}
 
 jobs:
@@ -42,8 +35,14 @@ jobs:
           echo "dev=$DEV" >> $GITHUB_OUTPUT
           echo "release=$RELEASE" >> $GITHUB_OUTPUT
 
-  build-wheel-and-push:
+  test-nightly:
     needs: set-outputs
+    if: ${{ needs.set-outputs.outputs.dev  == 'false' && needs.set-outputs.outputs.release  == 'false'}}
+    uses: ./.github/workflows/test-nightly.yml
+
+  build-wheel-and-push:
+    needs: [set-outputs, test-nightly]
+    if: ${{ always() && needs.set-outputs.outputs.dev == 'false' && needs.test-nightly.result == 'success' || always() && needs.set-outputs.outputs.dev  == 'true' && needs.set-outputs.result == 'success' }}
     uses: ./.github/workflows/build-wheel.yml
     with:
       build-label: ubuntu-20.04
@@ -55,22 +54,24 @@ jobs:
       python: '3.10'
     secrets: inherit
 
-  test-wheel-and-push-internal:
-    needs: build-wheel-and-push
-    uses: ./.github/workflows/test-wheel-push-to-internal.yml
+  test-wheel-and-publish:
+    needs: [set-outputs, build-wheel-and-push]
+    if: ${{ always() && !cancelled() && needs.build-wheel-and-push.result == 'success' }}
+    uses: ./.github/workflows/test-wheel-and-publish.yml
     with:
       build-label: ubuntu-20.04
       whl: ${{ needs.build-wheel-and-push.outputs.wheel }}
       python: '3.10'
+      dev: ${{ needs.set-outputs.outputs.dev }}
+      release: ${{ needs.set-outputs.outputs.release  }}
     secrets: inherit
 
-  # TODO: add nightly and release container build steps once wheel build push
-  # to production is automated. Removed until then.
   build-container-and-push:
-    needs: [set-outputs, test-wheel-and-push-internal]
+    needs: [test-wheel-and-publish, set-outputs]
+    if: ${{ always() && !cancelled() && needs.test-wheel-and-publish.result == 'success' }}
     uses: ./.github/workflows/build-container.yml
     with:
-      build-label: k8s-eng-gpu-64G-v100-32G
+      build-label: k8s-eng-gpu-16G-t4-32G
       dev: ${{ needs.set-outputs.outputs.dev }}
       release: ${{ needs.set-outputs.outputs.release  }}
       name: ${{ github.event.number }}
diff --git a/.github/workflows/publish-nightly-docker-images.yaml b/.github/workflows/publish-nightly-docker-images.yaml
deleted file mode 100644
index 5ca14ac08bc..00000000000
--- a/.github/workflows/publish-nightly-docker-images.yaml
+++ /dev/null
@@ -1,79 +0,0 @@
-name: Publish Nightly Docker Images
-
-on:
-  push:
-    branches:
-      - 'main'
-  schedule:
-    - cron: '0 1 * * *'
-  workflow_dispatch:
-jobs:
-  push-nightly-docker-image:
-    name: Push Version Tagged Nightly Docker Images
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      packages: write
-
-    steps:
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v2
-        with:
-          buildkitd-flags: --debug
-
-      - name: Login to Github Packages
-        uses: docker/login-action@v2
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Checkout code
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 1
-
-      - name: Get version tag
-        id: extract_tag
-        run: echo "tag=$(date +%Y%m%d)" >> $GITHUB_OUTPUT
-
-      - name: Current Version Name
-        run: |
-          echo ${{ steps.extract_tag.outputs.tag }}
-
-      - name: Sparseml-Nightly latest using default cuda 11.1.1
-        uses: docker/build-push-action@v2
-        with:
-          context: ./docker
-          build-args: |
-            DEPS=all
-            BRANCH=main
-          push: true
-          tags: |
-            ghcr.io/neuralmagic/sparseml-nightly:latest
-
-      - name: Today's Sparseml-Nightly using default cuda 11.1.1
-        uses: docker/build-push-action@v2
-        with:
-          context: ./docker
-          build-args: |
-            DEPS=all
-            BRANCH=main
-          push: true
-          tags: |
-            ghcr.io/neuralmagic/sparseml-nightly:${{ steps.extract_tag.outputs.tag }}
-
-      - name: Today's Sparseml-Nightly Base using default cuda 11.1.1
-        uses: docker/build-push-action@v2
-        with:
-          context: ./docker
-          build-args: |
-            DEPS=base
-            BRANCH=main
-          push: true
-          tags: |
-            ghcr.io/neuralmagic/sparseml-nightly:base-${{ steps.extract_tag.outputs.tag }}
-
-      - name: Image digest
-        run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/.github/workflows/test-nightly.yml b/.github/workflows/test-nightly.yml
index 8472b6c8134..4fc1c19cd84 100644
--- a/.github/workflows/test-nightly.yml
+++ b/.github/workflows/test-nightly.yml
@@ -1,8 +1,7 @@
 name: Run Nightly Tests
 on:
-  schedule:
-    - cron: '0 20 * * *'
   workflow_dispatch:
+  workflow_call:
 jobs:
   test-nightly-tests:
     runs-on: k8s-mle-gpu-12-vcpu-225GB-ram-2-a6000-48G
@@ -33,6 +32,5 @@ jobs:
         run: |
           pytest tests/sparseml/transformers/obcq -m integration
       - name: Run finetune tests
-        if: always()
         run: |
           pytest tests/sparseml/transformers/finetune -m integration
\ No newline at end of file
diff --git a/.github/workflows/test-wheel-push-to-internal.yml b/.github/workflows/test-wheel-and-publish.yml
similarity index 57%
rename from .github/workflows/test-wheel-push-to-internal.yml
rename to .github/workflows/test-wheel-and-publish.yml
index 28af2f272e7..e40fa462ded 100644
--- a/.github/workflows/test-wheel-push-to-internal.yml
+++ b/.github/workflows/test-wheel-and-publish.yml
@@ -1,4 +1,4 @@
-name: Test Wheel and Push to Internal PyPi
+name: Test Wheel and Publish
 on:
   workflow_call:
     inputs:
@@ -11,9 +11,15 @@ on:
         required: true
       python:
         type: string
+      dev:
+        type: string
+        required: true 
+      release:
+        type: string
+        required: true
 
 jobs:
-  test-wheel-and-push-internal:
+  test-wheel-and-publish:
     runs-on: ${{ inputs.build-label }}
     steps:
     - uses: actions/setup-python@v4
@@ -36,24 +42,37 @@ jobs:
         filename: ${{ inputs.whl }}
         dst: dist_s3
 
-    - name: Set Env
-      run: |
-        pip3 install virtualenv
-        virtualenv venv
-        source venv/bin/activate
-  
     - name: Fetch name of whl
       run: |
           echo "FILENAME=$(echo dist_s3/*.whl)" >> $GITHUB_ENV
 
     - name: Install whl
       run: |
-          pip3 install $FILENAME[dev]
+          pip3 install $FILENAME[dev,onnxruntime,torch,torchvision,transformers]
 
     - name: Checkout code
       uses: actions/checkout@v3
 
     - name: Remove src files and run tests
       run: |
+            pwd
             rm -rf src
-            make test
\ No newline at end of file
+            make test
+
+    - name: Make directory for wheel
+      run: |
+          mkdir dist_s3
+          
+    - name: Pull from s3
+      uses: neuralmagic/nm-actions/actions/s3_pull@main
+      with:
+        filename: ${{ inputs.whl }}
+        dst: dist_s3
+
+    - name: Publish Nightly Wheel
+      if: ${{ inputs.DEV == 'false' && inputs.RELEASE == 'false'}}
+      uses: neuralmagic/nm-actions/actions/publish-whl@main
+      with:
+        username: ${{ secrets.PYPI_PUBLIC_USER }}
+        password: ${{ secrets.PYPI_PUBLIC_AUTH }}
+        whl: ./$FILENAME
\ No newline at end of file

From 5c1de1c73577b9a4ca3666662a50ccff2c8acd03 Mon Sep 17 00:00:00 2001
From: Benjamin Fineran <bfineran@users.noreply.github.com>
Date: Thu, 13 Jun 2024 16:04:46 -0400
Subject: [PATCH 05/19] update llama7b_sparse_quantized example (#2322)

* update llama7b_sparse_quantized example

* one shot llama example

* Update examples/llama7b_sparse_quantized/README.md

Co-authored-by: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com>

* Fix GPTQ Aliases (#2327)

* fix alias application with unit tests

* style

---------

Co-authored-by: Sara Adkins <sara@neuralmagic.com>
Co-authored-by: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com>
---
 examples/llama7b_one_shot_quantization.md     | 50 ++++++++++++
 examples/llama7b_sparse_quantized/README.md   | 80 +++++++++++++------
 .../modifiers/quantization/gptq/base.py       | 42 +++++-----
 .../pruning/sparsegpt/test_pytorch.py         |  2 +-
 .../transformers/gptq/test_oneshot.py         | 80 ++++++++++++++-----
 5 files changed, 189 insertions(+), 65 deletions(-)
 create mode 100644 examples/llama7b_one_shot_quantization.md

diff --git a/examples/llama7b_one_shot_quantization.md b/examples/llama7b_one_shot_quantization.md
new file mode 100644
index 00000000000..d3ee50e1aaf
--- /dev/null
+++ b/examples/llama7b_one_shot_quantization.md
@@ -0,0 +1,50 @@
+# Creating a Quantized Llama Model in One Shot
+
+Quantizing a model to a lower precision can save on both memory and speed at inference time.
+This example demonstrates how to use the SparseML API to quantize a Llama model from 16 bits
+to 4 bits and save it to a compressed-tensors format for inference with vLLM.
+
+## Step 1: Select a model and dataset
+For this example, we will use a TinyLlama model and the open platypus dataset; however,
+these can be swapped out for any Hugging Face compatible models and datasets.
+
+```python
+model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+dataset = "open_platypus"
+```
+
+## Step 2: Configure a `GPTQModifier`
+Modifiers in sparseml are used to apply optimizations to models. In this example we use a
+`GPTQModifier` to apply the GPTQ algorithm to our model.  We target all `Linear` layers
+for 4-bit weight quantization.  These options may be swapped out for any valid `QuantizationScheme`.
+
+```python
+from sparseml.modifiers.quantization.gptq import GPTQModifier
+
+gptq = GPTQModifier(
+    targets="Linear",
+    scheme="W4A16"
+)
+```
+
+
+## Step 3: One-Shot Compression
+
+The `oneshot` api applies the created modifier to the target model and dataset.
+Setting `save_compressed` to True runs the model through `compressed_tensors` compression
+after the quantization is completed.
+
+```python
+from sparseml.transformers import oneshot
+
+oneshot(
+    model=model,
+    dataset=dataset,
+    recipe=gptq,
+    save_compressed=True,
+    output_dir="llama-compressed-example",
+    overwrite_output_dir=True,
+    max_seq_length=256,
+    num_calibration_samples=256,
+)
+```
diff --git a/examples/llama7b_sparse_quantized/README.md b/examples/llama7b_sparse_quantized/README.md
index 779696ba599..c96b6e7ca43 100644
--- a/examples/llama7b_sparse_quantized/README.md
+++ b/examples/llama7b_sparse_quantized/README.md
@@ -1,47 +1,79 @@
 # Creating a Sparse Quantized Llama7b Model
 
-The example in this folder runs in multiple stages to create a Llama 7b model with 
-a 2:4 sparsity pattern and W4A16 post training quantization (PTW). The model is 
-calibrated and trained with the ultachat200k dataset. At least 75GB of GPU memory is 
-required to run this example.
+This example uses SparseML and Compressed-Tensors to create a 2:4 sparse and quantized Llama2-7b model.
+The model is calibrated and trained with the ultrachat200k dataset.
+At least 75GB of GPU memory is required to run this example.
 
-## Recipe Summary
+Follow the steps below, or run the example directly with `python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py`.
 
-The recipe used for this flow is located in [2:4_w4a16_recipe.yaml](./2:4_w4a16_recipe.yaml). It contains 3 stages that are outlined below.
+## Step 1: Select a model, dataset, and recipe
+In this step, we select which model to use as a baseline for sparsification, a dataset to
+use for calibration and finetuning, and a recipe.
 
+Models can reference a local directory, a model in the Hugging Face Hub, or a model in the SparseZoo.
 
-### Stage 1: Sparsification
+Datasets can be from a local compatible directory or the huggingface hub.
 
-Runs the SparseGPT one-shot algorithm to prune the model to 50% sparsity with a 2:4 
-sparsity pattern. This means that 2 weights out of every group of 4 weights are masked to 0.
+Recipes are YAML files that describe how a model should be optimized during or after training.
+The recipe used for this flow is located in [2:4_w4a16_recipe.yaml](./2:4_w4a16_recipe.yaml).
+It contains instructions to prune the model to 2:4 sparsity, run one epoch of recovery finetuning,
+and quantize to 4 bits in one shot using GPTQ.
 
-### Stage 2: Finetuning Recovery
-
-This stage runs a single epoch of training on the ultrachat200k dataset while maintaining 
-the sparsity mask from stage 1. The purpose of this stage is to recover any accuracy lost 
-during the sparsification process.
+```python
+import torch
+from sparseml.transformers import SparseAutoModelForCausalLM
 
-### Stage 3: Quantization
+model_stub = "zoo:llama2-7b-ultrachat200k_llama2_pretrain-base"
+model = SparseAutoModelForCausalLM.from_pretrained(
+    model_stub, torch_dtype=torch.bfloat16, device_map="auto"
+)
 
-Finally, we run the GPTQ one-shot algorithm to quantize all linear weights to 4 bit 
-channelwise.
+dataset = "ultrachat-200k"
+splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}
 
-## How to Run
+recipe = "2:4_w4a16_recipe.yaml"
+```
 
-We can run the entire staged recipe with one call to SparseML's `apply` pathway. This 
-will save a checkpoint of the model after each stage.
+## Step 2: Run sparsification using `apply`
+The `apply` function applies the given recipe to our model and dataset.
+The hardcoded kwargs may be altered based on each model's needs.
+After running, the sparsified model will be saved to `output_llama7b_2:4_w4a16_channel`.
+
+```python
+from sparseml.transformers import apply
+
+output_dir = "output_llama7b_2:4_w4a16_channel"
+
+apply(
+    model=model,
+    dataset=dataset,
+    recipe=recipe,
+    bf16=False,  # use full precision for training
+    output_dir=output_dir,
+    splits=splits,
+    max_seq_length=512,
+    num_calibration_samples=512,
+    num_train_epochs=0.5,
+    logging_steps=500,
+    save_steps=5000,
+    gradient_checkpointing=True,
+    learning_rate=0.0001,
+    lr_scheduler_type="cosine",
+    warmup_ratio=0.1,
+)
+```
 
-```python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py```
 
-### Compression
+### Step 3: Compression
 
 The resulting model will be uncompressed. To save a final compressed copy of the model 
 run the following:
 
-```
+```python
 import torch
 from sparseml.transformers import SparseAutoModelForCausalLM
 
+compressed_output_dir = "output_llama7b_2:4_w4a16_channel_compressed"
 model = SparseAutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)
 model.save_pretrained(compressed_output_dir, save_compressed=True)
 ```
@@ -49,4 +81,4 @@ model.save_pretrained(compressed_output_dir, save_compressed=True)
 ### Custom Quantization
 The current repo supports multiple quantization techniques configured using a recipe. Supported strategies are `tensor`, `group` and `channel`. 
 The above recipe (`2:4_w4a16_recipe.yaml`) uses channel-wise quantization specified by `strategy: "channel"` in its config group. 
-To use quantize per tensor, change strategy from `channel` to `tensor`. To use group size quantization, change from `channel` to `group` and specify its value, say 128, by including `group_size: 128`. Group size quantization example is shown in `2:4_w4a16_group-128_recipe.yaml`
+To quantize per tensor, change strategy from `channel` to `tensor`. To use group size quantization, change from `channel` to `group` and specify its value, say 128, by including `group_size: 128`. A group size quantization example is shown in `2:4_w4a16_group-128_recipe.yaml`.
diff --git a/src/sparseml/modifiers/quantization/gptq/base.py b/src/sparseml/modifiers/quantization/gptq/base.py
index 004fce2ee7a..43bc596d849 100644
--- a/src/sparseml/modifiers/quantization/gptq/base.py
+++ b/src/sparseml/modifiers/quantization/gptq/base.py
@@ -18,9 +18,9 @@
 from pydantic import Field
 
 from compressed_tensors.quantization import (
-    QuantizationConfig,
     QuantizationScheme,
     is_preset_scheme,
+    preset_name_to_scheme,
 )
 from sparseml.core import Modifier
 from sparseml.core.factory import ModifierFactory
@@ -77,6 +77,7 @@ class GPTQModifier(Modifier):
         QuantizationScheme except targets, which will be set to the targets parameter
         set at the modifier level. Can also be set to a dictionary of the format
         `preset_scheme_name: targets` for example: `W8A8: ['Linear']` for weight 8 bit
+        or a string of a preset scheme if targets is provided
         and activation 8 bit quantization on the Linear layers.
     """
 
@@ -89,7 +90,7 @@ class GPTQModifier(Modifier):
     ignore: List[str] = Field(default_factory=list)
     disable_quantization_observer_epoch: Optional[float] = None
     num_calibration_steps: Optional[int] = None
-    scheme: Optional[Dict[str, Any]] = None
+    scheme: Optional[Union[str, Dict[str, Any]]] = None
     compressible_layers_: Optional[List] = None
     quantization_modifier_: Any = None
 
@@ -167,32 +168,33 @@ def _build_quant_modifier(self, framework):
             if getattr(self, key, False)
         }
 
+        if isinstance(self.targets, str):
+            self.targets = [self.targets]
+
         if self.scheme is not None:
             # takes precedence over config_groups
 
-            if any(is_preset_scheme(key) for key in self.scheme.keys()):
-                config_groups = QuantizationConfig(
-                    config_groups=self.scheme
-                ).config_groups
-                quant_args["config_groups"] = config_groups
-            else:
-                targets = self.targets or ["Linear"]
-                config_group = QuantizationScheme.model_validate(
-                    {"targets": targets, **self.scheme}
-                )
-                quant_args["config_groups"] = {"config_group_0": config_group}
+            if isinstance(self.scheme, str) and is_preset_scheme(self.scheme):
+                # attach targets to scheme
+                self.scheme = {self.scheme: self.targets}
 
-            targets = self.targets or ["Linear"]
-            config_group = QuantizationScheme.model_validate(
-                {"targets": targets, **self.scheme}
-            )
-            quant_args["config_groups"] = {"config_group_0": config_group}
+            quant_args["config_groups"] = {}
+            for idx, key in enumerate(self.scheme.keys()):
+                if is_preset_scheme(key):
+                    scheme = preset_name_to_scheme(key, self.scheme[key])
+                else:
+                    scheme = QuantizationScheme.model_validate(
+                        {"targets": self.scheme[key], **self.scheme}
+                    )
+
+                group_name = f"group_{idx}"
+                quant_args["config_groups"][group_name] = scheme
 
-        if "config_groups" not in quant_args:
+        if not quant_args.get("config_groups"):
             default_quant_scheme = QuantizationScheme.default_scheme(
                 targets=self.targets
             )
-            quant_args["config_groups"] = {"config_group_0": default_quant_scheme}
+            quant_args["config_groups"] = {"group_0": default_quant_scheme}
         _LOGGER.info(f"Building quantization modifier with args: {quant_args}")
         vllm_quant_config = {"QuantizationModifier": quant_args}
         self._build_quant_modifier_from_dict(vllm_quant_config, framework)
diff --git a/tests/sparseml/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py b/tests/sparseml/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py
index 0fcb66eee9c..1b9f365bebf 100644
--- a/tests/sparseml/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py
+++ b/tests/sparseml/pytorch/modifiers/pruning/sparsegpt/test_pytorch.py
@@ -95,7 +95,7 @@ def test_create_default_quant_modifier(self):
         modifier.on_initialize_structure(testing_harness.get_state())
         assert modifier.quantize
         assert isinstance(modifier.quantization_modifier_, QuantizationModifier)
-        default_config_group_name = "config_group_0"
+        default_config_group_name = "group_0"
         should_be_default_quant_scheme = modifier.quantization_modifier_.config_groups[
             default_config_group_name
         ]
diff --git a/tests/sparseml/transformers/gptq/test_oneshot.py b/tests/sparseml/transformers/gptq/test_oneshot.py
index c7c14275df1..1d2e28cc303 100644
--- a/tests/sparseml/transformers/gptq/test_oneshot.py
+++ b/tests/sparseml/transformers/gptq/test_oneshot.py
@@ -16,11 +16,57 @@
 import shutil
 import unittest
 
+from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
+from parameterized import parameterized_class
+from sparseml.modifiers.quantization.gptq import GPTQModifier
 from sparseml.transformers.sparsification.sparse_model import SparseAutoModelForCausalLM
 from tests.testing_utils import requires_torch
 
 
+recipe_str = """
+quant_stage:
+    quant_modifiers:
+        GPTQModifier:
+            sequential_update: false
+            ignore: ["lm_head"]
+            config_groups:
+                group_0:
+                    weights:
+                        num_bits: 4
+                        type: "int"
+                        symmetric: true
+                        strategy: "channel"
+                    targets: ["Linear"]
+"""
+
+recipe_modifier_full = GPTQModifier(
+    ignore=["lm_head"],
+    sequential_update=False,
+    config_groups={
+        "group_0": QuantizationScheme(
+            targets=["Linear"], weights=QuantizationArgs(num_bits=4, strategy="channel")
+        )
+    },
+)
+
+recipe_modifier_shorthand_a = GPTQModifier(
+    ignore=["lm_head"], sequential_update=False, targets="Linear", scheme="W4A16"
+)
+
+recipe_modifier_shorthand_b = GPTQModifier(
+    ignore=["lm_head"], sequential_update=False, scheme={"W4A16": ["Linear"]}
+)
+
+
 @requires_torch
+@parameterized_class(
+    [
+        {"recipe": recipe_str},
+        {"recipe": recipe_modifier_full},
+        {"recipe": recipe_modifier_shorthand_a},
+        {"recipe": recipe_modifier_shorthand_b},
+    ]
+)
 class TestGPTQOneShotWithFullScheme(unittest.TestCase):
     def setUp(self):
         import torch
@@ -30,26 +76,6 @@ def setUp(self):
         self.dataset = "open_platypus"
         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
-        self.recipe = """
-        first_stage:
-            quant_modifiers:
-                    GPTQModifier:
-                        ignore: ["lm_head"]
-                        sequential_update: True
-                        dampening_frac: 0.001
-                        block_size: 128
-                        targets: ["Linear"]
-                        scheme:
-                            input_activations: null
-                            output_activations: null
-                            weights:
-                                num_bits: 8
-                                type: "int"
-                                symmetric: true
-                                strategy: "tensor"
-                                group_size: 128
-        """
-
     def test_oneshot_application(self):
         from sparseml.transformers import oneshot
 
@@ -68,9 +94,23 @@ def test_oneshot_application(self):
         # Check that the model is quantized
         assert model_loaded.quantization_config is not None
 
+        # check config is set properly
+        assert model_loaded.quantization_config.ignore == ["lm_head"]
+        assert len(model_loaded.quantization_config.config_groups) == 1
+        quant_scheme = model_loaded.quantization_config.config_groups["group_0"]
+        assert isinstance(quant_scheme, QuantizationScheme)
+        assert quant_scheme.targets == ["Linear"]
+        weight_args = model_loaded.quantization_config.config_groups["group_0"].weights
+        assert isinstance(weight_args, QuantizationArgs)
+        assert weight_args.num_bits == 4
+
         # Check a specific layer is quantized
         targetted_linear_layer = model_loaded.transformer.h[0].attn.attention.k_proj
         assert hasattr(targetted_linear_layer, "quantization_scheme")
 
+        # Check lm-head is not quantized
+        not_targetted = model_loaded.lm_head
+        assert not hasattr(not_targetted, "quantization_scheme")
+
     def tearDown(self):
         shutil.rmtree(self.output)

From cfe86f4a976cfb9b658ae36de874ba38865ad1d9 Mon Sep 17 00:00:00 2001
From: Benjamin Fineran <bfineran@users.noreply.github.com>
Date: Fri, 14 Jun 2024 14:43:27 -0400
Subject: [PATCH 06/19] limit sparsezoo and deepsparse deps to 1.7 (#2329)

---
 setup.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 5e55ebec249..f64e706c397 100644
--- a/setup.py
+++ b/setup.py
@@ -56,11 +56,11 @@
     "protobuf>=3.12.2,<=3.20.3",
     "click>=7.1.2,!=8.0.0",  # latest version < 8.0 + blocked version with reported bug
 ]
-_nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}~={version_nm_deps}"]
+_nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}>={1.7.0}"]
 _deepsparse_deps = [
-    f"{'deepsparse' if is_release else 'deepsparse-nightly'}~={version_nm_deps}"
+    f"{'deepsparse' if is_release else 'deepsparse-nightly'}>={1.7.0}"
 ]
-_deepsparse_ent_deps = [f"deepsparse-ent~={version_nm_deps}"]
+_deepsparse_ent_deps = [f"deepsparse-ent>={1.7.0}"]
 
 _onnxruntime_deps = ["onnxruntime>=1.0.0"]
 _clip_deps = ["open_clip_torch==2.20.0"]

From 4a2fcd9b522449befe5a1941ea4da7027d73d1ed Mon Sep 17 00:00:00 2001
From: Benjamin Fineran <bfineran@users.noreply.github.com>
Date: Fri, 14 Jun 2024 14:57:18 -0400
Subject: [PATCH 07/19] fix f-string formatting in setup.py update (#2330)

---
 setup.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index f64e706c397..a533821deac 100644
--- a/setup.py
+++ b/setup.py
@@ -56,11 +56,11 @@
     "protobuf>=3.12.2,<=3.20.3",
     "click>=7.1.2,!=8.0.0",  # latest version < 8.0 + blocked version with reported bug
 ]
-_nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}>={1.7.0}"]
+_nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}>=1.7.0"]
 _deepsparse_deps = [
-    f"{'deepsparse' if is_release else 'deepsparse-nightly'}>={1.7.0}"
+    f"{'deepsparse' if is_release else 'deepsparse-nightly'}>=1.7.0"
 ]
-_deepsparse_ent_deps = [f"deepsparse-ent>={1.7.0}"]
+_deepsparse_ent_deps = ["deepsparse-ent>=1.7.0"]
 
 _onnxruntime_deps = ["onnxruntime>=1.0.0"]
 _clip_deps = ["open_clip_torch==2.20.0"]

From 813033e618808d3165349136bfcba2edd7a6a216 Mon Sep 17 00:00:00 2001
From: Benjamin Fineran <bfineran@users.noreply.github.com>
Date: Mon, 17 Jun 2024 10:29:55 -0400
Subject: [PATCH 08/19] style fix setup.py (#2331)

---
 setup.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index a533821deac..928b13ebc9e 100644
--- a/setup.py
+++ b/setup.py
@@ -57,9 +57,7 @@
     "click>=7.1.2,!=8.0.0",  # latest version < 8.0 + blocked version with reported bug
 ]
 _nm_deps = [f"{'sparsezoo' if is_release else 'sparsezoo-nightly'}>=1.7.0"]
-_deepsparse_deps = [
-    f"{'deepsparse' if is_release else 'deepsparse-nightly'}>=1.7.0"
-]
+_deepsparse_deps = [f"{'deepsparse' if is_release else 'deepsparse-nightly'}>=1.7.0"]
 _deepsparse_ent_deps = ["deepsparse-ent>=1.7.0"]
 
 _onnxruntime_deps = ["onnxruntime>=1.0.0"]

From 43995039db04b7e6511226fe701f0b31e0cf33c6 Mon Sep 17 00:00:00 2001
From: Sara Adkins <sara@neuralmagic.com>
Date: Mon, 17 Jun 2024 10:35:36 -0400
Subject: [PATCH 09/19] Update Test Recipes for Latest Modifier Changes (#2326)

* update recipes for new modifier

* fixes

---------

Co-authored-by: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com>
---
 .../transformers/obcq/recipes/quant.yaml      | 27 +-------------
 .../obcq/recipes/quant_and_sparse.yaml        | 37 ++++---------------
 .../transformers/obcq/recipes/sparse.yaml     |  5 +--
 .../transformers/obcq/test_obcq_sparsity.py   |  4 --
 4 files changed, 10 insertions(+), 63 deletions(-)

diff --git a/tests/sparseml/transformers/obcq/recipes/quant.yaml b/tests/sparseml/transformers/obcq/recipes/quant.yaml
index f5436b3873f..9c5a6ac6209 100644
--- a/tests/sparseml/transformers/obcq/recipes/quant.yaml
+++ b/tests/sparseml/transformers/obcq/recipes/quant.yaml
@@ -6,32 +6,7 @@ test_stage:
         [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
         [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"]
       ]
-    LegacyQuantizationModifier:
-      ignore:
-        - LlamaRotaryEmbedding
-        - LlamaRMSNorm
-        - SiLU
-        - model.layers.0.mlp.down_proj
-        - model.layers.1.mlp.down_proj
-        - model.layers.2.mlp.down_proj
-        - model.layers.3.mlp.down_proj
-        - model.layers.4.mlp.down_proj
-        - model.layers.5.mlp.down_proj
-      scheme_overrides:
-        Embedding:
-          input_activations: null
-          weights:
-            num_bits: 8
-            symmetric: False
     GPTQModifier:
       block_size: 128
       sequential_update: False
-      percdamp: 0.01
-      targets: [
-        "model.layers.0",
-        "model.layers.1",
-        "model.layers.2",
-        "model.layers.3",
-        "model.layers.4",
-        "model.layers.5"
-      ]  
\ No newline at end of file
+      percdamp: 0.01
\ No newline at end of file
diff --git a/tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml b/tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml
index 198b32f0e3c..643ba175597 100644
--- a/tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml
+++ b/tests/sparseml/transformers/obcq/recipes/quant_and_sparse.yaml
@@ -1,5 +1,11 @@
 test_stage:
   obcq_modifiers:
+    SparseGPTModifier:
+      sparsity: 0.5
+      block_size: 128
+      sequential_update: False
+      percdamp: 0.01
+      mask_structure: "0:0"
     SmoothQuantModifier:
       smoothing_strength: 0.5
       mappings: [
@@ -11,13 +17,6 @@ test_stage:
         - LlamaRotaryEmbedding
         - LlamaRMSNorm
         - SiLU
-        - model.layers.0.mlp.down_proj
-        - model.layers.1.mlp.down_proj
-        - model.layers.2.mlp.down_proj
-        - model.layers.3.mlp.down_proj
-        - model.layers.4.mlp.down_proj
-        - model.layers.5.mlp.down_proj
-      post_oneshot_calibration: True
       scheme_overrides:
         Embedding:
           input_activations: null
@@ -27,26 +26,4 @@ test_stage:
     GPTQModifier:
       block_size: 128
       sequential_update: False
-      percdamp: 0.01
-      targets: [
-        "model.layers.0",
-        "model.layers.1",
-        "model.layers.2",
-        "model.layers.3",
-        "model.layers.4",
-        "model.layers.5"
-      ]
-    SparseGPTModifier:
-      sparsity: 0.5
-      block_size: 128
-      sequential_update: False
-      percdamp: 0.01
-      mask_structure: "0:0"
-      targets: [
-        "model.layers.0",
-        "model.layers.1",
-        "model.layers.2",
-        "model.layers.3",
-        "model.layers.4",
-        "model.layers.5"
-      ]
\ No newline at end of file
+      percdamp: 0.01
\ No newline at end of file
diff --git a/tests/sparseml/transformers/obcq/recipes/sparse.yaml b/tests/sparseml/transformers/obcq/recipes/sparse.yaml
index 70ffc7bf784..4309a066377 100644
--- a/tests/sparseml/transformers/obcq/recipes/sparse.yaml
+++ b/tests/sparseml/transformers/obcq/recipes/sparse.yaml
@@ -5,9 +5,8 @@ test_stage:
       block_size: 128
       sequential_update: False
       percdamp: 0.01
-      mask_structure: "0:0"
       targets: [
         "model.layers.0",
         "model.layers.1",
-        "lm_head"
-      ]
\ No newline at end of file
+      ]
+      mask_structure: "0:0"
\ No newline at end of file
diff --git a/tests/sparseml/transformers/obcq/test_obcq_sparsity.py b/tests/sparseml/transformers/obcq/test_obcq_sparsity.py
index c6c80b301aa..d8e25271c9b 100644
--- a/tests/sparseml/transformers/obcq/test_obcq_sparsity.py
+++ b/tests/sparseml/transformers/obcq/test_obcq_sparsity.py
@@ -60,8 +60,6 @@ def test_sparsities(self):
 
         model = get_session_model()
 
-        lm_head_sparsity = tensor_sparsity(model.lm_head.weight)
-        assert math.isclose(lm_head_sparsity.item(), self.sparsity, rel_tol=1e-4)
         layer_1_sparse = tensor_sparsity(model.model.layers[1].self_attn.k_proj.weight)
         assert math.isclose(layer_1_sparse.item(), self.sparsity, rel_tol=1e-4)
         layer_2_dense = tensor_sparsity(model.model.layers[2].self_attn.k_proj.weight)
@@ -118,8 +116,6 @@ def test_sparsities_gpu(self):
 
         model = get_session_model()
 
-        lm_head_sparsity = tensor_sparsity(model.lm_head.weight)
-        assert math.isclose(lm_head_sparsity.item(), self.sparsity, rel_tol=1e-4)
         layer_1_sparse = tensor_sparsity(model.model.layers[1].self_attn.k_proj.weight)
         assert math.isclose(layer_1_sparse.item(), self.sparsity, rel_tol=1e-4)
         layer_2_dense = tensor_sparsity(model.model.layers[2].self_attn.k_proj.weight)

From 50b685854dd4b78ac3c647b13a9fbc7009b5fa2e Mon Sep 17 00:00:00 2001
From: Sara Adkins <sara@neuralmagic.com>
Date: Mon, 17 Jun 2024 14:37:11 -0400
Subject: [PATCH 10/19] restrict numpy <2.0 (#2333)

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 928b13ebc9e..7eaaf76ea1f 100644
--- a/setup.py
+++ b/setup.py
@@ -38,7 +38,7 @@
 
 _deps = [
     "pyyaml>=5.0.0",
-    "numpy>=1.17.0",
+    "numpy>=1.17.0,<2.0",
     "matplotlib>=3.0.0",
     "merge-args>=0.1.0",
     "onnx>=1.5.0,<1.15.0",

From bd0d80cc21cd39cf0ba43f785d31a8e0d3aa1674 Mon Sep 17 00:00:00 2001
From: George <george@neuralmagic.com>
Date: Mon, 17 Jun 2024 15:16:22 -0400
Subject: [PATCH 11/19] fix wheel build (#2332)

* fix wheel build

* comment
---
 setup.py                | 19 +++++++++++------
 src/sparseml/version.py | 14 ++++++++-----
 utils/artifacts.py      | 46 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 11 deletions(-)
 create mode 100644 utils/artifacts.py

diff --git a/setup.py b/setup.py
index 7eaaf76ea1f..413e533aa4c 100644
--- a/setup.py
+++ b/setup.py
@@ -16,18 +16,25 @@
 from typing import Dict, List, Tuple
 
 from setuptools import find_packages, setup
+from utils.artifacts import get_release_and_version
 
 
-# default variables to be overwritten by the version.py file
-is_release = None
-is_dev = None
-version = "unknown"
-version_major_minor = version
+package_path = os.path.join(
+    os.path.dirname(os.path.realpath(__file__)), "src", "sparseml"
+)
+(
+    is_release,
+    is_dev,
+    version,
+    version_major,
+    version_minor,
+    version_bug,
+) = get_release_and_version(package_path)
 
 # load and overwrite version and release info from sparseml package
 exec(open(os.path.join("src", "sparseml", "version.py")).read())
 print(f"loaded version {version} from src/sparseml/version.py")
-version_nm_deps = f"{version_major_minor}.0"
+version_nm_deps = f"{version_major}.{version_minor}.0"
 
 if is_release:
     _PACKAGE_NAME = "sparseml"
diff --git a/src/sparseml/version.py b/src/sparseml/version.py
index ffe77da583a..1279a498a4b 100644
--- a/src/sparseml/version.py
+++ b/src/sparseml/version.py
@@ -16,16 +16,20 @@
 Functionality for storing and setting the version info for SparseML
 """
 
-from datetime import date
-
-
 version_base = "1.8.0"
 is_release = False  # change to True to set the generated version as a release version
 is_dev = False
 dev_number = None
 
 
-def _generate_version():
+def _generate_version(
+    is_release: bool,
+    is_dev: bool,
+    version_base: str,
+    dev_number: str,
+):
+    from datetime import date
+
     if is_release:
         return version_base
     elif is_dev:
@@ -45,7 +49,7 @@ def _generate_version():
     "version_build",
     "version_major_minor",
 ]
-__version__ = _generate_version()
+__version__ = _generate_version(is_release, is_dev, version_base, dev_number)
 
 version = __version__
 version_major, version_minor, version_bug, version_build = version.split(".") + (
diff --git a/utils/artifacts.py b/utils/artifacts.py
new file mode 100644
index 00000000000..a93bda61122
--- /dev/null
+++ b/utils/artifacts.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+from typing import Tuple
+
+
+def get_release_and_version(package_path: str) -> Tuple[bool, bool, str, str, str, str]:
+    """
+    Load version and release info from the version.py file under package_path
+    """
+    # <package_path>/version.py is the source of truth for version info
+    version_path = os.path.join(package_path, "version.py")
+
+    # exec() cannot set this function's locals, so collect the names it defines in a dict
+    locals_dict = {}
+    exec(open(version_path).read(), globals(), locals_dict)
+    is_release = locals_dict.get("is_release", False)
+    is_dev = locals_dict.get("is_dev", False)
+    version = locals_dict.get("version", "unknown")
+    version_major = locals_dict.get("version_major", "unknown")
+    version_minor = locals_dict.get("version_minor", "unknown")
+    version_bug = locals_dict.get("version_bug", "unknown")
+
+    print(f"Loaded version {version} from {version_path}")
+
+    return (
+        is_release,
+        is_dev,
+        version,
+        version_major,
+        version_minor,
+        version_bug,
+    )

From e4ba0279411b1c6e3fb73d4015f87ba8fca6baf1 Mon Sep 17 00:00:00 2001
From: dhuangnm <74931910+dhuangnm@users.noreply.github.com>
Date: Tue, 18 Jun 2024 09:09:48 -0400
Subject: [PATCH 12/19] use nm-actions for release build (#2335)

Co-authored-by: dhuangnm <dhuang@MacBook-Pro-2.local>
---
 .github/workflows/build-release-wheel.yaml | 57 ++++++++++++++++++++++
 .github/workflows/build-release.yml        | 19 --------
 2 files changed, 57 insertions(+), 19 deletions(-)
 create mode 100644 .github/workflows/build-release-wheel.yaml
 delete mode 100644 .github/workflows/build-release.yml

diff --git a/.github/workflows/build-release-wheel.yaml b/.github/workflows/build-release-wheel.yaml
new file mode 100644
index 00000000000..a5f5a250b83
--- /dev/null
+++ b/.github/workflows/build-release-wheel.yaml
@@ -0,0 +1,57 @@
+name: Build release wheel
+
+on:
+  push:
+    branches:
+      - 'release/*'
+
+  workflow_dispatch:
+    inputs:
+      gitref:
+        description: "git tag, commit or branch name for the release"
+        type: string
+        required: true
+        default: 'release/1.8
+
+jobs:
+  build-release-wheel:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+          
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.gitref }}
+      
+      - name: Build PyPi Wheel
+        id: build
+        uses: neuralmagic/nm-actions/actions/pypi_build@main
+        with:
+          dev: false
+          release: true
+
+      - name: Set Env
+        run: |
+          pip3 install --upgrade pip && pip3 install --upgrade setuptools
+          pip3 install virtualenv
+          virtualenv venv
+          source venv/bin/activate
+
+      - name: upload whl
+        uses: actions/upload-artifact@v4
+        if: success() || failure()
+        with:
+          name: "wheel-sparseml"
+          path: ${{ steps.build.outputs.whlname }}
+          retention-days: 7
+
+      - name: upload tar.gz
+        uses: actions/upload-artifact@v4
+        if: success() || failure()
+        with:
+          name: "tar-sparseml"
+          path: ${{ steps.build.outputs.tarname }}
+          retention-days: 7
diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
deleted file mode 100644
index 26d9eb31946..00000000000
--- a/.github/workflows/build-release.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-name: build-release
-run-name: ${{ github.workflow }} is to create release wheel file for pypi
-on:
-  push:
-    branches:
-      - 'release/[0-9]+.[0-9]+'
-  workflow_dispatch:
-
-jobs:
-
-    BUILD-SPARSEML-RELEASE:
-
-      uses: ./.github/workflows/util.yml
-      with:
-        runs_on: ubuntu-22.04
-        run_id: ${{ github.run_id }}
-        build_type: release
-        testmo_project_id: 9
-      secrets: inherit

From ffa38520af22a20d0b05ede55cae67bc04d18550 Mon Sep 17 00:00:00 2001
From: dhuangnm <74931910+dhuangnm@users.noreply.github.com>
Date: Tue, 18 Jun 2024 09:14:13 -0400
Subject: [PATCH 13/19] missing a '

---
 .github/workflows/build-release-wheel.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-release-wheel.yaml b/.github/workflows/build-release-wheel.yaml
index a5f5a250b83..20457401a6a 100644
--- a/.github/workflows/build-release-wheel.yaml
+++ b/.github/workflows/build-release-wheel.yaml
@@ -11,7 +11,7 @@ on:
         description: "git tag, commit or branch name for the release"
         type: string
         required: true
-        default: 'release/1.8
+        default: 'release/1.8'
 
 jobs:
   build-release-wheel:

From 22cc6be232b6b7b4417d75b1fb7d6af467563e62 Mon Sep 17 00:00:00 2001
From: Sara Adkins <sara@neuralmagic.com>
Date: Tue, 18 Jun 2024 10:49:55 -0400
Subject: [PATCH 14/19] Sparse Quantization Example Clarification (#2334)

* clarify example

* cleanup

* update examples

* update output name
---
 examples/llama7b_sparse_quantized/README.md | 8 +++++---
 examples/llama7b_w8a8_quantization.py       | 6 +++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/examples/llama7b_sparse_quantized/README.md b/examples/llama7b_sparse_quantized/README.md
index c96b6e7ca43..f10bb0984ab 100644
--- a/examples/llama7b_sparse_quantized/README.md
+++ b/examples/llama7b_sparse_quantized/README.md
@@ -4,7 +4,8 @@ This example uses SparseML and Compressed-Tensors to create a 2:4 sparse and qua
 The model is calibrated and trained with the ultachat200k dataset.
 At least 75GB of GPU memory is required to run this example.
 
-Follow the steps below, or to run the example as `python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py`
+Follow the steps below one by one in a code notebook, or run the full example script 
+as `python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py`
 
 ## Step 1: Select a model, dataset, and recipe
 In this step, we select which model to use as a baseline for sparsification, a dataset to
@@ -36,7 +37,8 @@ recipe = "2:4_w4a16_recipe.yaml"
 
 ## Step 2: Run sparsification using `apply`
 The `apply` function applies the given recipe to our model and dataset.
-The hardcoded kwargs may be altered based on each model's needs.
+The hardcoded kwargs may be altered based on each model's needs. This code snippet should 
+be run in the same Python instance as step 1.
 After running, the sparsified model will be saved to `output_llama7b_2:4_w4a16_channel`.
 
 ```python
@@ -67,7 +69,7 @@ apply(
 ### Step 3: Compression
 
 The resulting model will be uncompressed. To save a final compressed copy of the model 
-run the following:
+run the following in the same Python instance as the previous steps.
 
 ```python
 import torch
diff --git a/examples/llama7b_w8a8_quantization.py b/examples/llama7b_w8a8_quantization.py
index c894613ffbb..702218f7db7 100644
--- a/examples/llama7b_w8a8_quantization.py
+++ b/examples/llama7b_w8a8_quantization.py
@@ -16,12 +16,12 @@
                         num_bits: 8
                         type: "int"
                         symmetric: true
-                        strategy: "channel"
+                        strategy: "tensor"
                     input_activations:
                         num_bits: 8
                         type: "int"
                         symmetric: true
-                        dynamic: True
+                        dynamic: true
                         strategy: "token"
                     targets: ["Linear"]
 """
@@ -37,7 +37,7 @@
 dataset = "ultrachat-200k"
 
 # save location of quantized model out
-output_dir = "./output_llama7b_w8a8_channel_dynamic_compressed"
+output_dir = "./output_llama7b_w8a8_dynamic_compressed"
 
 # set dataset config parameters
 splits = {"calibration": "train_gen[:5%]"}

From 9a28cd76d0bf816bfae51ccaa1f9862aff4c40af Mon Sep 17 00:00:00 2001
From: Benjamin Fineran <bfineran@users.noreply.github.com>
Date: Tue, 18 Jun 2024 16:48:26 -0400
Subject: [PATCH 15/19] update ignore list in llama examples (#2338)

---
 examples/llama7b_one_shot_quantization.md                      | 3 ++-
 .../llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml   | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/llama7b_one_shot_quantization.md b/examples/llama7b_one_shot_quantization.md
index d3ee50e1aaf..af644897065 100644
--- a/examples/llama7b_one_shot_quantization.md
+++ b/examples/llama7b_one_shot_quantization.md
@@ -23,7 +23,8 @@ from sparseml.modifiers.quantization.gptq import GPTQModifier
 
 gptq = GPTQModifier(
     targets="Linear",
-    scheme="W4A16"
+    scheme="W4A16",
+    ignore=["lm_head"],
 )
 ```
 
diff --git a/examples/llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml b/examples/llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml
index aeddebb8cb3..6f35f511396 100644
--- a/examples/llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml
+++ b/examples/llama7b_sparse_quantized/2:4_w4a16_group-128_recipe.yaml
@@ -23,6 +23,7 @@ quantization_stage:
   run_type: oneshot
   quantization_modifiers:
     GPTQModifier:
+      ignore: [ "lm_head" ]
       sequential_update: false
       config_groups:
         group_0:

From 52245b3faded4c31bf5d3f24526beb11d33355ae Mon Sep 17 00:00:00 2001
From: Benjamin Fineran <bfineran@users.noreply.github.com>
Date: Wed, 19 Jun 2024 16:02:58 -0400
Subject: [PATCH 16/19] suppress pydantic warning in legacy quantization
 modifier (#2340)

---
 src/sparseml/modifiers/quantization_legacy/base.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/sparseml/modifiers/quantization_legacy/base.py b/src/sparseml/modifiers/quantization_legacy/base.py
index 9b9f1569f09..27856774c00 100644
--- a/src/sparseml/modifiers/quantization_legacy/base.py
+++ b/src/sparseml/modifiers/quantization_legacy/base.py
@@ -14,6 +14,8 @@
 
 from typing import Any, Dict, List, Optional
 
+from pydantic import ConfigDict
+
 from sparseml.core import Event, Modifier
 
 
@@ -81,6 +83,8 @@ class LegacyQuantizationModifier(Modifier):
     post_oneshot_calibration: Optional[bool] = False
     strict: bool = True
 
+    model_config = ConfigDict(protected_namespaces=())
+
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         if self.model_fuse_fn_kwargs is None:

From c4ec6319c17c728c55fe0dec12ee1d17bd72c325 Mon Sep 17 00:00:00 2001
From: Sara Adkins <sara@neuralmagic.com>
Date: Thu, 20 Jun 2024 10:44:36 -0400
Subject: [PATCH 17/19] fix uncompressed path (#2339)

---
 examples/llama7b_sparse_quantized/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/llama7b_sparse_quantized/README.md b/examples/llama7b_sparse_quantized/README.md
index f10bb0984ab..1a48c01afc6 100644
--- a/examples/llama7b_sparse_quantized/README.md
+++ b/examples/llama7b_sparse_quantized/README.md
@@ -73,10 +73,12 @@ run the following in the same Python instance as the previous steps.
 
 ```python
 import torch
+import os
 from sparseml.transformers import SparseAutoModelForCausalLM
 
 compressed_output_dir = "output_llama7b_2:4_w4a16_channel_compressed"
-model = SparseAutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)
+uncompressed_path = os.path.join(output_dir, "stage_quantization")
+model = SparseAutoModelForCausalLM.from_pretrained(uncompressed_path, torch_dtype=torch.bfloat16)
 model.save_pretrained(compressed_output_dir, save_compressed=True)
 ```
 

From 0becc723db5a69e2455330b194f459e6be3a5e90 Mon Sep 17 00:00:00 2001
From: Sara Adkins <sara@neuralmagic.com>
Date: Thu, 20 Jun 2024 12:55:03 -0400
Subject: [PATCH 18/19] update README memory requirements (#2342)

---
 examples/llama7b_sparse_quantized/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llama7b_sparse_quantized/README.md b/examples/llama7b_sparse_quantized/README.md
index 1a48c01afc6..35183345d9c 100644
--- a/examples/llama7b_sparse_quantized/README.md
+++ b/examples/llama7b_sparse_quantized/README.md
@@ -2,7 +2,7 @@
 
 This example uses SparseML and Compressed-Tensors to create a 2:4 sparse and quantized Llama2-7b model.
 The model is calibrated and trained with the ultachat200k dataset.
-At least 75GB of GPU memory is required to run this example.
+At least 85GB of GPU memory is required to run this example.
 
 Follow the steps below one by one in a code notebook, or run the full example script 
 as `python examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py`

From f0a369203c4f310f013c52952f533f8a89788464 Mon Sep 17 00:00:00 2001
From: Benjamin Fineran <bfineran@users.noreply.github.com>
Date: Thu, 20 Jun 2024 15:51:11 -0400
Subject: [PATCH 19/19] suppress pydantic warnings for model_ fields (#2344)

---
 src/sparseml/exporters/transforms/kv_cache/configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sparseml/exporters/transforms/kv_cache/configs.py b/src/sparseml/exporters/transforms/kv_cache/configs.py
index 686adf5c7d5..d0135a2c133 100644
--- a/src/sparseml/exporters/transforms/kv_cache/configs.py
+++ b/src/sparseml/exporters/transforms/kv_cache/configs.py
@@ -84,7 +84,7 @@ class KeyValueCacheConfig(BaseModel):
         "the kv cache. If this is not provided, no transpose will "
         "be applied.",
     )
-    model_config = ConfigDict(arbitrary_types_allowed=True)
+    model_config = ConfigDict(arbitrary_types_allowed=True, protected_namespaces=())
 
 
 OPT_CONFIG = KeyValueCacheConfig(