From 7bb3db3aefe496f8d9969e32b6c7b503b9863e62 Mon Sep 17 00:00:00 2001
From: Rahul Tuli
Date: Fri, 24 May 2024 10:23:41 -0400
Subject: [PATCH] [GPTQ UX] Add scheme arg with QuantizationScheme support
 (#2286)

* Update GHA file to install compressed-tensors from source

* Missed commit (#2300)

* Remove src from import

* Style

* Full Scheme support

* Add a small test for accepting full scheme
---
 .../modifiers/quantization/gptq/base.py | 13 ++++
 .../transformers/gptq/test_oneshot.py   | 76 +++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 tests/sparseml/transformers/gptq/test_oneshot.py

diff --git a/src/sparseml/modifiers/quantization/gptq/base.py b/src/sparseml/modifiers/quantization/gptq/base.py
index b91fa2dad60..7f1ca823cc7 100644
--- a/src/sparseml/modifiers/quantization/gptq/base.py
+++ b/src/sparseml/modifiers/quantization/gptq/base.py
@@ -68,6 +68,10 @@ class GPTQModifier(Modifier):
         not be updated. Leave None to not disable observers during QAT. Default is None
     :param num_calibration_steps: Number of steps to run post training calibration
         for. When None, the entire calibration_dataloader is used
+    :param scheme: [Used only if a quantization modifier is not specified] the
+        quantization scheme to apply to the model. This is a dictionary that supports
+        all keys from QuantizationScheme except targets, which will be set to the
+        targets parameter specified at the modifier level.
     """
 
     sequential_update: Optional[bool] = False
@@ -79,6 +83,7 @@ class GPTQModifier(Modifier):
     ignore: List[str] = Field(default_factory=list)
     disable_quantization_observer_epoch: Optional[float] = None
     num_calibration_steps: Optional[int] = None
+    scheme: Optional[Dict[str, Any]] = None
     compressible_layers_: Optional[List] = None
     quantization_modifier_: Any = None
@@ -156,6 +161,14 @@ def _build_quant_modifier(self, framework):
             if getattr(self, key, False)
         }
 
+        if self.scheme is not None:
+            # takes precedence over any config_groups collected above
+            targets = self.targets or ["Linear"]
+            config_group = QuantizationScheme.model_validate(
+                {"targets": targets, **self.scheme}
+            )
+            quant_args["config_groups"] = {"config_group_0": config_group}
+
         if "config_groups" not in quant_args:
             default_quant_scheme = QuantizationScheme.default_scheme(
                 targets=self.targets

diff --git a/tests/sparseml/transformers/gptq/test_oneshot.py b/tests/sparseml/transformers/gptq/test_oneshot.py
new file mode 100644
index 00000000000..c7c14275df1
--- /dev/null
+++ b/tests/sparseml/transformers/gptq/test_oneshot.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import shutil
+import unittest
+
+from sparseml.transformers.sparsification.sparse_model import SparseAutoModelForCausalLM
+from tests.testing_utils import requires_torch
+
+
+@requires_torch
+class TestGPTQOneShotWithFullScheme(unittest.TestCase):
+    def setUp(self):
+        import torch
+
+        self.output = "./oneshot_output"
+        self.model = "roneneldan/TinyStories-1M"
+        self.dataset = "open_platypus"
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+        self.recipe = """
+        first_stage:
+            quant_modifiers:
+                GPTQModifier:
+                    ignore: ["lm_head"]
+                    sequential_update: True
+                    dampening_frac: 0.001
+                    block_size: 128
+                    targets: ["Linear"]
+                    scheme:
+                        input_activations: null
+                        output_activations: null
+                        weights:
+                            num_bits: 8
+                            type: "int"
+                            symmetric: true
+                            strategy: "tensor"
+                            group_size: 128
+        """
+
+    def test_oneshot_application(self):
+        from sparseml.transformers import oneshot
+
+        oneshot(
+            model=self.model,
+            dataset=self.dataset,
+            output_dir=self.output,
+            overwrite_output_dir=True,
+            recipe=self.recipe,
+            oneshot_device=self.device,
+            num_calibration_samples=9,
+        )
+
+        model_loaded = SparseAutoModelForCausalLM.from_pretrained(self.output)
+
+        # Check that the model is quantized
+        assert model_loaded.quantization_config is not None
+
+        # Check that a specific layer is quantized
+        targeted_linear_layer = model_loaded.transformer.h[0].attn.attention.k_proj
+        assert hasattr(targeted_linear_layer, "quantization_scheme")
+
+    def tearDown(self):
+        shutil.rmtree(self.output)
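
For reference, below is a minimal sketch of the scheme-to-config_groups translation that the base.py hunk above performs. The QuantizationScheme import path is an assumption based on the compressed-tensors dependency mentioned in the changelog (base.py already has the class in scope), and the scheme values simply mirror the recipe in test_oneshot.py:

# Minimal sketch, assuming QuantizationScheme is importable from the
# compressed-tensors package referenced in the changelog above.
from compressed_tensors.quantization import QuantizationScheme

# A user-supplied scheme dict, mirroring the recipe in test_oneshot.py;
# any QuantizationScheme key except "targets" may appear here.
scheme = {
    "input_activations": None,
    "output_activations": None,
    "weights": {
        "num_bits": 8,
        "type": "int",
        "symmetric": True,
        "strategy": "tensor",
        "group_size": 128,
    },
}

# Modifier-level targets are merged in, falling back to ["Linear"] when unset;
# the validated scheme then becomes the single config group handed to the
# underlying quantization modifier, exactly as in _build_quant_modifier.
targets = ["Linear"]
config_group = QuantizationScheme.model_validate({"targets": targets, **scheme})
quant_args = {"config_groups": {"config_group_0": config_group}}
print(quant_args)

Because this branch runs before the `"config_groups" not in quant_args` fallback, a scheme passed this way takes precedence over both an explicitly set config_groups attribute and the default scheme.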