From 2bd35e539c9417b09da80fa7878e49e7db58ad59 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Thu, 23 May 2024 15:38:45 +0000
Subject: [PATCH 1/8] update examples to use new ux

---
 .../llama7b_sparse_quantized/2:4_w4a16_recipe.yaml | 10 +++-------
 examples/llama7b_w4a16_quantization.ipynb          | 14 +++++---------
 examples/llama7b_w4a16_quantization.py             |  7 ++-----
 examples/llama7b_w8a8_quantization.py              |  9 +++------
 4 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml b/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
index 9969e5d77ce..aeddebb8cb3 100644
--- a/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
+++ b/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
@@ -22,8 +22,8 @@ finetuning_stage:
 quantization_stage:
   run_type: oneshot
   quantization_modifiers:
-    vLLMQuantizationModifier:
-      ignore: ["lm_head"]
+    GPTQModifier:
+      sequential_update: false
       config_groups:
         group_0:
           weights:
@@ -31,8 +31,4 @@ quantization_stage:
             type: "int"
             symmetric: true
             strategy: "channel"
-          targets: ["Linear"]
-    SparseGPTModifier:
-      sparsity: 0.0
-      quantize: True
-      sequential_update: false
\ No newline at end of file
+          targets: ["Linear"]
\ No newline at end of file
diff --git a/examples/llama7b_w4a16_quantization.ipynb b/examples/llama7b_w4a16_quantization.ipynb
index ad1ee7af8ce..194215891fa 100644
--- a/examples/llama7b_w4a16_quantization.ipynb
+++ b/examples/llama7b_w4a16_quantization.ipynb
@@ -25,10 +25,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "SparseML uses recipes to define configurations for different oneshot algorithms. Recipes can be defined as a string or a yaml file. Below we create a sample recipe for GPTQ quantization. The recipe is made up of two different algorithms, called modifiers.\n",
+    "SparseML uses recipes to define configurations for different oneshot algorithms. Recipes can be defined as a string or a yaml file. A recipe consists of one or more sparsification or quantization algorithms, called modifiers in SparseML. Below we create a sample recipe for GPTQ quantization that only requires a single modifier.\n",
     "\n",
-    "1. **vLLMQuantizationModifier**: calibrates the model for quantization by calculating scale and zero points from a small amount of calibration data\n",
-    "2. **SparseGPTModifier**: applies the GPTQ algorithm, using the result of the vLLMQuantizationModifier to determine the best quantization bin to place each linear weight into"
+    "This modifier specifies that we should quantize the weights of each linear layer to 4 bits, using a symmetric channelwise quantization pattern. The lm-head will not be quantized even though it is a Linear layer, because it is included in the ignore list."
    ]
   },
   {
@@ -37,10 +36,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "recipe=\"\"\"\n",
+    "recipe = \"\"\"\n",
     "quant_stage:\n",
     "    quant_modifiers:\n",
-    "        vLLMQuantizationModifier:\n",
+    "        GPTQModifier:\n",
+    "            sequential_update: false\n",
     "            ignore: [\"lm_head\"]\n",
     "            config_groups:\n",
     "                group_0:\n",
@@ -50,10 +50,6 @@
     "                        symmetric: true\n",
     "                        strategy: \"channel\"\n",
     "                    targets: [\"Linear\"]\n",
-    "        SparseGPTModifier:\n",
-    "            sparsity: 0.0\n",
-    "            quantize: True\n",
-    "            sequential_update: false\n",
     "\"\"\""
   ]
  },
diff --git a/examples/llama7b_w4a16_quantization.py b/examples/llama7b_w4a16_quantization.py
index 5aabf496436..a4a5f6bbb53 100644
--- a/examples/llama7b_w4a16_quantization.py
+++ b/examples/llama7b_w4a16_quantization.py
@@ -7,7 +7,8 @@
 recipe = """
 quant_stage:
     quant_modifiers:
-        vLLMQuantizationModifier:
+        GPTQModifier:
+            sequential_update: false
             ignore: ["lm_head"]
             config_groups:
                 group_0:
@@ -17,10 +18,6 @@
                     symmetric: true
                     strategy: "channel"
                 targets: ["Linear"]
-        SparseGPTModifier:
-            sparsity: 0.0
-            quantize: true
-            sequential_update: false
 """

 # setting device_map to auto to spread the model evenly across all available GPUs
diff --git a/examples/llama7b_w8a8_quantization.py b/examples/llama7b_w8a8_quantization.py
index 5f70a2f1ae7..c894613ffbb 100644
--- a/examples/llama7b_w8a8_quantization.py
+++ b/examples/llama7b_w8a8_quantization.py
@@ -7,7 +7,8 @@
 recipe = """
 quant_stage:
     quant_modifiers:
-        vLLMQuantizationModifier:
+        GPTQModifier:
+            sequential_update: false
             ignore: ["lm_head"]
             config_groups:
                 group_0:
@@ -23,10 +24,6 @@
                     dynamic: True
                     strategy: "token"
                 targets: ["Linear"]
-        SparseGPTModifier:
-            sparsity: 0.0
-            quantize: true
-            sequential_update: false
 """

 # setting device_map to auto to spread the model evenly across all available GPUs
@@ -40,7 +37,7 @@
 dataset = "ultrachat-200k"

 # save location of quantized model out
-output_dir = "./output_llama7b_w8a8_channel_compressed"
+output_dir = "./output_llama7b_w8a8_channel_dynamic_compressed"

 # set dataset config parameters
 splits = {"calibration": "train_gen[:5%]"}

From d4ea973d75c4e8ed3c851d814cfe3173037981ff Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Thu, 23 May 2024 20:32:22 +0000
Subject: [PATCH 2/8] add sparse model

---
 examples/llama7b_quantize_sparse_cnn.py | 74 +++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 examples/llama7b_quantize_sparse_cnn.py

diff --git a/examples/llama7b_quantize_sparse_cnn.py b/examples/llama7b_quantize_sparse_cnn.py
new file mode 100644
index 00000000000..23316976df5
--- /dev/null
+++ b/examples/llama7b_quantize_sparse_cnn.py
@@ -0,0 +1,74 @@
+import torch
+from datasets import load_dataset
+
+from sparseml.transformers import (
+    SparseAutoModelForCausalLM,
+    SparseAutoTokenizer,
+    oneshot,
+)
+
+
+# define a sparseml recipe for GPTQ W4A16 quantization
+recipe = """
+quant_stage:
+    quant_modifiers:
+        GPTQModifier:
+            sequential_update: false
+            ignore: ["lm_head"]
+            config_groups:
+                group_0:
+                    weights:
+                        num_bits: 4
+                        type: "int"
+                        symmetric: true
+                        strategy: "channel"
+                    targets: ["Linear"]
+"""
+
+# load in a 50% sparse model with 2:4 sparsity structure
+# setting device_map to auto to spread the model evenly across all available GPUs
+model_stub = "neuralmagic/SparseLlama-2-7b-cnn-daily-mail-pruned_50.2of4"
+model = SparseAutoModelForCausalLM.from_pretrained(
+    model_stub, torch_dtype=torch.bfloat16, device_map="auto"
+)
+tokenizer = SparseAutoTokenizer.from_pretrained(model_stub)
+
+# for quantization calibration, we will use a subset of the dataset that was used to
+# sparsity and finetune the model
+dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train[:5%]")
+
+# set dataset config parameters
+max_seq_length = 1024
+pad_to_max_length = False
+num_calibration_samples = 512
+
+
+# preprocess the data into a single text entry, then tokenize the dataset
+def process_sample(sample):
+    formatted = "Article:\n{}\n\n### Summarization:\n{}".format(
+        sample["article"], sample["highlights"]
+    )
+    return tokenizer(
+        formatted, padding=pad_to_max_length, max_length=max_seq_length, truncation=True
+    )
+
+
+tokenized_dataset = dataset.map(
+    process_sample, remove_columns=["article", "highlights", "id"]
+)
+
+# save location of quantized model out
+output_dir = "/network/sadkins/llama7b_sparse_24_w4a16_channel_compressed"
+
+# apply quantization recipe to the model and save quantized output int4 packed format
+# the sparsity structure of the original model will be maintained
+oneshot(
+    model=model,
+    dataset=tokenized_dataset,
+    recipe=recipe,
+    output_dir=output_dir,
+    max_seq_length=max_seq_length,
+    pad_to_max_length=pad_to_max_length,
+    num_calibration_samples=num_calibration_samples,
+    save_compressed=True,
+)

From e4be56844770547e9f3831661f0eda91a9a7ee96 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Thu, 23 May 2024 20:40:39 +0000
Subject: [PATCH 3/8] update paths

---
 examples/llama7b_quantize_sparse_cnn.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/llama7b_quantize_sparse_cnn.py b/examples/llama7b_quantize_sparse_cnn.py
index 23316976df5..263eebb8ca1 100644
--- a/examples/llama7b_quantize_sparse_cnn.py
+++ b/examples/llama7b_quantize_sparse_cnn.py
@@ -18,7 +18,7 @@
             config_groups:
                 group_0:
                     weights:
-                        num_bits: 4
+                        num_bits: 8
                         type: "int"
                         symmetric: true
                         strategy: "channel"
@@ -34,13 +34,13 @@
 tokenizer = SparseAutoTokenizer.from_pretrained(model_stub)

 # for quantization calibration, we will use a subset of the dataset that was used to
-# sparsity and finetune the model
+# sparsify and finetune the model
 dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train[:5%]")

 # set dataset config parameters
-max_seq_length = 1024
+max_seq_length = 4096
 pad_to_max_length = False
-num_calibration_samples = 512
+num_calibration_samples = 1024
@@ -58,7 +58,7 @@ def process_sample(sample):
 )

 # save location of quantized model out
-output_dir = "/network/sadkins/llama7b_sparse_24_w4a16_channel_compressed"
+output_dir = "./llama7b_sparse_24_w4a16_channel_compressed"

 # apply quantization recipe to the model and save quantized output int4 packed format

From 07f1ddd2dcb79e2626a9900850ee4ce01f11557f Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Thu, 23 May 2024 20:41:00 +0000
Subject: [PATCH 4/8] update paths

---
 examples/llama7b_quantize_sparse_cnn.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/llama7b_quantize_sparse_cnn.py b/examples/llama7b_quantize_sparse_cnn.py
index 263eebb8ca1..b05e4965e1b 100644
--- a/examples/llama7b_quantize_sparse_cnn.py
+++ b/examples/llama7b_quantize_sparse_cnn.py
@@ -18,7 +18,7 @@
             config_groups:
                 group_0:
                     weights:
-                        num_bits: 8
+                        num_bits: 4
                         type: "int"
                         symmetric: true
                         strategy: "channel"
@@ -38,9 +38,9 @@
 dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train[:5%]")

 # set dataset config parameters
-max_seq_length = 4096
+max_seq_length = 512
 pad_to_max_length = False
-num_calibration_samples = 1024
+num_calibration_samples = 512


 # preprocess the data into a single text entry, then tokenize the dataset

From dd83a54005fb8fb3fed28023aab70666c03e8441 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Thu, 23 May 2024 21:33:55 +0000
Subject: [PATCH 5/8] up samples for sparse model

---
 examples/llama7b_quantize_sparse_cnn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llama7b_quantize_sparse_cnn.py b/examples/llama7b_quantize_sparse_cnn.py
index b05e4965e1b..2ce3fb0ac6a 100644
--- a/examples/llama7b_quantize_sparse_cnn.py
+++ b/examples/llama7b_quantize_sparse_cnn.py
@@ -38,9 +38,9 @@
 dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train[:5%]")

 # set dataset config parameters
-max_seq_length = 512
+max_seq_length = 4096
 pad_to_max_length = False
-num_calibration_samples = 512
+num_calibration_samples = 1024


 # preprocess the data into a single text entry, then tokenize the dataset

From 421ff88f2c7aea0811557e5b5e9f8dac9bb18439 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Tue, 28 May 2024 13:33:46 +0000
Subject: [PATCH 6/8] fix recipe

---
 .../llama7b_sparse_quantized/2:4_w4a16_recipe.yaml |   3 ++-
 ...workstation-deployment-57c9d55774-9p6vq.47256.0 | Bin 0 -> 40 bytes
 ...workstation-deployment-57c9d55774-9p6vq.47256.1 | Bin 0 -> 40 bytes
 ...workstation-deployment-57c9d55774-9p6vq.47256.2 | Bin 0 -> 40 bytes
 4 files changed, 2 insertions(+), 1 deletion(-)
 create mode 100644 examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.0
 create mode 100644 examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.1
 create mode 100644 examples/tensorboard/events.out.tfevents.1716221651.workstation-deployment-57c9d55774-9p6vq.47256.2

diff --git a/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml b/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
index aeddebb8cb3..1c4d2a09802 100644
--- a/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
+++ b/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
@@ -24,6 +24,7 @@ quantization_stage:
   quantization_modifiers:
     GPTQModifier:
       sequential_update: false
+      ignore: ["lm_head"]
       config_groups:
         group_0:
           weights:
@@ -31,4 +32,4 @@ quantization_stage:
             type: "int"
             symmetric: true
             strategy: "channel"
-          targets: ["Linear"]
\ No newline at end of file
+          targets: ["Linear"]
diff --git a/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.0 b/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.0
new file mode 100644
index 0000000000000000000000000000000000000000..f50c1e1d35d72d428ca5c9089328bd1502ba3709
GIT binary patch
literal 40
rcmb1OfPlsI-b$Po)~jr}JL#sQ6mL>dVrHJ6YguYuiIvg5{v~_>=3Nb6

literal 0
HcmV?d00001

diff --git a/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.1 b/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.1
new file mode 100644
index 0000000000000000000000000000000000000000..e0a460d3b90a39b1e3c23fdfd23720d03f728d7a
GIT binary patch
literal 40
rcmb1OfPlsI-b$Q5E~sp|JL#sQ6mL>dVrHJ6YguYuiIvgUz^YCF?l=wd

literal 0
HcmV?d00001

diff --git a/examples/tensorboard/events.out.tfevents.1716221651.workstation-deployment-57c9d55774-9p6vq.47256.2 b/examples/tensorboard/events.out.tfevents.1716221651.workstation-deployment-57c9d55774-9p6vq.47256.2
new file mode 100644
index 0000000000000000000000000000000000000000..01b3f2ebae03ef3ab2d990f314dbaae42834885c
GIT binary patch
literal 40
rcmb1OfPlsI-b$QH1Mh6PJL#sQ6mL>dVrHJ6YguYuiItK1b-VWf=+O

Date: Tue, 28 May 2024 13:36:13 +0000
Subject: [PATCH 7/8] remove extra files

---
 ...workstation-deployment-57c9d55774-9p6vq.47256.0 | Bin 40 -> 0 bytes
 ...workstation-deployment-57c9d55774-9p6vq.47256.1 | Bin 40 -> 0 bytes
 ...workstation-deployment-57c9d55774-9p6vq.47256.2 | Bin 40 -> 0 bytes
 3 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.0
 delete mode 100644 examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.1
 delete mode 100644 examples/tensorboard/events.out.tfevents.1716221651.workstation-deployment-57c9d55774-9p6vq.47256.2

diff --git a/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.0 b/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.0
deleted file mode 100644
index f50c1e1d35d72d428ca5c9089328bd1502ba3709..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 40
rcmb1OfPlsI-b$Po)~jr}JL#sQ6mL>dVrHJ6YguYuiIvg5{v~_>=3Nb6

diff --git a/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.1 b/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.1
deleted file mode 100644
index e0a460d3b90a39b1e3c23fdfd23720d03f728d7a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 40
rcmb1OfPlsI-b$Q5E~sp|JL#sQ6mL>dVrHJ6YguYuiIvgUz^YCF?l=wd

diff --git a/examples/tensorboard/events.out.tfevents.1716221651.workstation-deployment-57c9d55774-9p6vq.47256.2 b/examples/tensorboard/events.out.tfevents.1716221651.workstation-deployment-57c9d55774-9p6vq.47256.2
deleted file mode 100644
index 01b3f2ebae03ef3ab2d990f314dbaae42834885c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 40
rcmb1OfPlsI-b$QH1Mh6PJL#sQ6mL>dVrHJ6YguYuiItK1b-VWf=+O

Date: Tue, 28 May 2024 15:33:48 +0000
Subject: [PATCH 8/8] update sparse dtype

---
 examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py b/examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py
index f70bf20a947..fe454a0d7ad 100644
--- a/examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py
+++ b/examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py
@@ -24,12 +24,12 @@
 num_calibration_samples = 512

 # set training parameters for finetuning
-num_train_epochs = 1
+num_train_epochs = 0.5
 logging_steps = 500
 save_steps = 5000
 gradient_checkpointing = True  # saves memory during training
 learning_rate = 0.0001
-bf16 = True  # using bfloat16 for training
+bf16 = False  # using full precision for training
 lr_scheduler_type = "cosine"
 warmup_ratio = 0.1
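For readers following the series, the sketch below condenses the single-modifier flow these patches converge on: one GPTQModifier recipe applied through `oneshot`. It is a minimal illustration assembled from the examples in this PR, not an additional file in it; the model stub, output directory, and calibration settings simply mirror the llama7b_quantize_sparse_cnn.py example above and should be treated as placeholders.

```python
import torch
from datasets import load_dataset

from sparseml.transformers import (
    SparseAutoModelForCausalLM,
    SparseAutoTokenizer,
    oneshot,
)

# single-modifier GPTQ W4A16 recipe -- GPTQModifier replaces the old
# vLLMQuantizationModifier + SparseGPTModifier pair used before this PR
recipe = """
quant_stage:
    quant_modifiers:
        GPTQModifier:
            sequential_update: false
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 4
                        type: "int"
                        symmetric: true
                        strategy: "channel"
                    targets: ["Linear"]
"""

# placeholder stub taken from the sparse example above; substitute your own model
model_stub = "neuralmagic/SparseLlama-2-7b-cnn-daily-mail-pruned_50.2of4"
model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub, torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = SparseAutoTokenizer.from_pretrained(model_stub)

# small calibration split, tokenized the same way as the cnn_dailymail example
max_seq_length = 4096
num_calibration_samples = 1024
dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train[:5%]")
tokenized = dataset.map(
    lambda s: tokenizer(
        "Article:\n{}\n\n### Summarization:\n{}".format(s["article"], s["highlights"]),
        max_length=max_seq_length,
        truncation=True,
    ),
    remove_columns=["article", "highlights", "id"],
)

# one-shot GPTQ quantization; save_compressed writes the packed int4 weights
oneshot(
    model=model,
    dataset=tokenized,
    recipe=recipe,
    output_dir="./llama7b_w4a16_channel_compressed",  # placeholder output path
    max_seq_length=max_seq_length,
    pad_to_max_length=False,
    num_calibration_samples=num_calibration_samples,
    save_compressed=True,
)
```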