From 2bd35e539c9417b09da80fa7878e49e7db58ad59 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Thu, 23 May 2024 15:38:45 +0000
Subject: [PATCH 1/8] update examples to use new ux

---
 .../llama7b_sparse_quantized/2:4_w4a16_recipe.yaml | 10 +++-------
 examples/llama7b_w4a16_quantization.ipynb          | 14 +++++---------
 examples/llama7b_w4a16_quantization.py             |  7 ++-----
 examples/llama7b_w8a8_quantization.py              |  9 +++------
 4 files changed, 13 insertions(+), 27 deletions(-)

diff --git a/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml b/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
index 9969e5d77ce..aeddebb8cb3 100644
--- a/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
+++ b/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
@@ -22,8 +22,8 @@ finetuning_stage:
 quantization_stage:
   run_type: oneshot
   quantization_modifiers:
-    vLLMQuantizationModifier:
-      ignore: ["lm_head"]
+    GPTQModifier:
+      sequential_update: false
       config_groups:
         group_0:
           weights:
@@ -31,8 +31,4 @@ quantization_stage:
             type: "int"
             symmetric: true
             strategy: "channel"
-          targets: ["Linear"]
-    SparseGPTModifier:
-      sparsity: 0.0
-      quantize: True
-      sequential_update: false
\ No newline at end of file
+          targets: ["Linear"]
\ No newline at end of file
diff --git a/examples/llama7b_w4a16_quantization.ipynb b/examples/llama7b_w4a16_quantization.ipynb
index ad1ee7af8ce..194215891fa 100644
--- a/examples/llama7b_w4a16_quantization.ipynb
+++ b/examples/llama7b_w4a16_quantization.ipynb
@@ -25,10 +25,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "SparseML uses recipes to define configurations for different oneshot algorithms. Recipes can be defined as a string or a yaml file. Below we create a sample recipe for GPTQ quantization. The recipe is made up of two different algorithms, called modifiers.\n",
+    "SparseML uses recipes to define configurations for different oneshot algorithms. Recipes can be defined as a string or a yaml file. A recipe consists of one or more sparsification or quantization algorithms, called modifiers in SparseML. Below we create a sample recipe for GPTQ quantization that only requires a single modifier.\n",
     "\n",
-    "1. **vLLMQuantizationModifier**: calibrates the model for quantization by calculating scale and zero points from a small amount of calibration data\n",
-    "2. **SparseGPTModifier**: applies the GPTQ algorithm, using the result of the vLLMQuantizationModifier to determine the best quantization bin to place each linear weight into"
+    "This modifier specifies that we should quantize the weights of each linear layer to 4 bits, using a symmetric channelwise quantization pattern. The lm-head will not be quantized even though it is a Linear layer, because it is included in the ignore list."
    ]
   },
   {
@@ -37,10 +36,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "recipe=\"\"\"\n",
+    "recipe = \"\"\"\n",
     "quant_stage:\n",
     "    quant_modifiers:\n",
-    "        vLLMQuantizationModifier:\n",
+    "        GPTQModifier:\n",
+    "            sequential_update: false\n",
     "            ignore: [\"lm_head\"]\n",
     "            config_groups:\n",
     "                group_0:\n",
@@ -50,10 +50,6 @@
     "                        symmetric: true\n",
     "                        strategy: \"channel\"\n",
     "                    targets: [\"Linear\"]\n",
-    "        SparseGPTModifier:\n",
-    "            sparsity: 0.0\n",
-    "            quantize: True\n",
-    "            sequential_update: false\n",
     "\"\"\""
   ]
  },
diff --git a/examples/llama7b_w4a16_quantization.py b/examples/llama7b_w4a16_quantization.py
index 5aabf496436..a4a5f6bbb53 100644
--- a/examples/llama7b_w4a16_quantization.py
+++ b/examples/llama7b_w4a16_quantization.py
@@ -7,7 +7,8 @@
 recipe = """
 quant_stage:
     quant_modifiers:
-        vLLMQuantizationModifier:
+        GPTQModifier:
+            sequential_update: false
             ignore: ["lm_head"]
             config_groups:
                 group_0:
@@ -17,10 +18,6 @@
                     symmetric: true
                     strategy: "channel"
                 targets: ["Linear"]
-        SparseGPTModifier:
-            sparsity: 0.0
-            quantize: true
-            sequential_update: false
 """

 # setting device_map to auto to spread the model evenly across all available GPUs
diff --git a/examples/llama7b_w8a8_quantization.py b/examples/llama7b_w8a8_quantization.py
index 5f70a2f1ae7..c894613ffbb 100644
--- a/examples/llama7b_w8a8_quantization.py
+++ b/examples/llama7b_w8a8_quantization.py
@@ -7,7 +7,8 @@
 recipe = """
 quant_stage:
     quant_modifiers:
-        vLLMQuantizationModifier:
+        GPTQModifier:
+            sequential_update: false
             ignore: ["lm_head"]
             config_groups:
                 group_0:
@@ -23,10 +24,6 @@
                     dynamic: True
                     strategy: "token"
                 targets: ["Linear"]
-        SparseGPTModifier:
-            sparsity: 0.0
-            quantize: true
-            sequential_update: false
 """

 # setting device_map to auto to spread the model evenly across all available GPUs
@@ -40,7 +37,7 @@
 dataset = "ultrachat-200k"

 # save location of quantized model out
-output_dir = "./output_llama7b_w8a8_channel_compressed"
+output_dir = "./output_llama7b_w8a8_channel_dynamic_compressed"

 # set dataset config parameters
 splits = {"calibration": "train_gen[:5%]"}

From d4ea973d75c4e8ed3c851d814cfe3173037981ff Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Thu, 23 May 2024 20:32:22 +0000
Subject: [PATCH 2/8] add sparse model

---
 examples/llama7b_quantize_sparse_cnn.py | 74 +++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 examples/llama7b_quantize_sparse_cnn.py

diff --git a/examples/llama7b_quantize_sparse_cnn.py b/examples/llama7b_quantize_sparse_cnn.py
new file mode 100644
index 00000000000..23316976df5
--- /dev/null
+++ b/examples/llama7b_quantize_sparse_cnn.py
@@ -0,0 +1,74 @@
+import torch
+from datasets import load_dataset
+
+from sparseml.transformers import (
+    SparseAutoModelForCausalLM,
+    SparseAutoTokenizer,
+    oneshot,
+)
+
+
+# define a sparseml recipe for GPTQ W4A16 quantization
+recipe = """
+quant_stage:
+    quant_modifiers:
+        GPTQModifier:
+            sequential_update: false
+            ignore: ["lm_head"]
+            config_groups:
+                group_0:
+                    weights:
+                        num_bits: 4
+                        type: "int"
+                        symmetric: true
+                        strategy: "channel"
+                    targets: ["Linear"]
+"""
+
+# load in a 50% sparse model with 2:4 sparsity structure
+# setting device_map to auto to spread the model evenly across all available GPUs
+model_stub = "neuralmagic/SparseLlama-2-7b-cnn-daily-mail-pruned_50.2of4"
+model = SparseAutoModelForCausalLM.from_pretrained(
+    model_stub, torch_dtype=torch.bfloat16, device_map="auto"
+)
+tokenizer = SparseAutoTokenizer.from_pretrained(model_stub)
+
+# for quantization calibration, we will use a subset of the dataset that was used to
+# sparsity and finetune the model
+dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train[:5%]")
+
+# set dataset config parameters
+max_seq_length = 1024
+pad_to_max_length = False
+num_calibration_samples = 512
+
+
+# preprocess the data into a single text entry, then tokenize the dataset
+def process_sample(sample):
+    formatted = "Article:\n{}\n\n### Summarization:\n{}".format(
+        sample["article"], sample["highlights"]
+    )
+    return tokenizer(
+        formatted, padding=pad_to_max_length, max_length=max_seq_length, truncation=True
+    )
+
+
+tokenized_dataset = dataset.map(
+    process_sample, remove_columns=["article", "highlights", "id"]
+)
+
+# save location of quantized model out
+output_dir = "/network/sadkins/llama7b_sparse_24_w4a16_channel_compressed"
+
+# apply quantization recipe to the model and save quantized output int4 packed format
+# the sparsity structure of the original model will be maintained
+oneshot(
+    model=model,
+    dataset=tokenized_dataset,
+    recipe=recipe,
+    output_dir=output_dir,
+    max_seq_length=max_seq_length,
+    pad_to_max_length=pad_to_max_length,
+    num_calibration_samples=num_calibration_samples,
+    save_compressed=True,
+)

From e4be56844770547e9f3831661f0eda91a9a7ee96 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Thu, 23 May 2024 20:40:39 +0000
Subject: [PATCH 3/8] update paths

---
 examples/llama7b_quantize_sparse_cnn.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/llama7b_quantize_sparse_cnn.py b/examples/llama7b_quantize_sparse_cnn.py
index 23316976df5..263eebb8ca1 100644
--- a/examples/llama7b_quantize_sparse_cnn.py
+++ b/examples/llama7b_quantize_sparse_cnn.py
@@ -18,7 +18,7 @@
             config_groups:
                 group_0:
                     weights:
-                        num_bits: 4
+                        num_bits: 8
                         type: "int"
                         symmetric: true
                         strategy: "channel"
@@ -34,13 +34,13 @@
 tokenizer = SparseAutoTokenizer.from_pretrained(model_stub)

 # for quantization calibration, we will use a subset of the dataset that was used to
-# sparsity and finetune the model
+# sparsify and finetune the model
 dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train[:5%]")

 # set dataset config parameters
-max_seq_length = 1024
+max_seq_length = 4096
 pad_to_max_length = False
-num_calibration_samples = 512
+num_calibration_samples = 1024
@@ -58,7 +58,7 @@ def process_sample(sample):
 )

 # save location of quantized model out
-output_dir = "/network/sadkins/llama7b_sparse_24_w4a16_channel_compressed"
+output_dir = "./llama7b_sparse_24_w4a16_channel_compressed"

 # apply quantization recipe to the model and save quantized output int4 packed format

From 07f1ddd2dcb79e2626a9900850ee4ce01f11557f Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Thu, 23 May 2024 20:41:00 +0000
Subject: [PATCH 4/8] update paths

---
 examples/llama7b_quantize_sparse_cnn.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/llama7b_quantize_sparse_cnn.py b/examples/llama7b_quantize_sparse_cnn.py
index 263eebb8ca1..b05e4965e1b 100644
--- a/examples/llama7b_quantize_sparse_cnn.py
+++ b/examples/llama7b_quantize_sparse_cnn.py
@@ -18,7 +18,7 @@
             config_groups:
                 group_0:
                     weights:
-                        num_bits: 8
+                        num_bits: 4
                         type: "int"
                         symmetric: true
                         strategy: "channel"
@@ -38,9 +38,9 @@
 dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train[:5%]")

 # set dataset config parameters
-max_seq_length = 4096
+max_seq_length = 512
 pad_to_max_length = False
-num_calibration_samples = 1024
+num_calibration_samples = 512


 # preprocess the data into a single text entry, then tokenize the dataset

From dd83a54005fb8fb3fed28023aab70666c03e8441 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Thu, 23 May 2024 21:33:55 +0000
Subject: [PATCH 5/8] up samples for sparse model

---
 examples/llama7b_quantize_sparse_cnn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llama7b_quantize_sparse_cnn.py b/examples/llama7b_quantize_sparse_cnn.py
index b05e4965e1b..2ce3fb0ac6a 100644
--- a/examples/llama7b_quantize_sparse_cnn.py
+++ b/examples/llama7b_quantize_sparse_cnn.py
@@ -38,9 +38,9 @@
 dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train[:5%]")

 # set dataset config parameters
-max_seq_length = 512
+max_seq_length = 4096
 pad_to_max_length = False
-num_calibration_samples = 512
+num_calibration_samples = 1024


 # preprocess the data into a single text entry, then tokenize the dataset

From 421ff88f2c7aea0811557e5b5e9f8dac9bb18439 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Tue, 28 May 2024 13:33:46 +0000
Subject: [PATCH 6/8] fix recipe

---
 .../llama7b_sparse_quantized/2:4_w4a16_recipe.yaml |   3 ++-
 ...workstation-deployment-57c9d55774-9p6vq.47256.0 | Bin 0 -> 40 bytes
 ...workstation-deployment-57c9d55774-9p6vq.47256.1 | Bin 0 -> 40 bytes
 ...workstation-deployment-57c9d55774-9p6vq.47256.2 | Bin 0 -> 40 bytes
 4 files changed, 2 insertions(+), 1 deletion(-)
 create mode 100644 examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.0
 create mode 100644 examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.1
 create mode 100644 examples/tensorboard/events.out.tfevents.1716221651.workstation-deployment-57c9d55774-9p6vq.47256.2

diff --git a/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml b/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
index aeddebb8cb3..1c4d2a09802 100644
--- a/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
+++ b/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
@@ -24,6 +24,7 @@ quantization_stage:
   quantization_modifiers:
     GPTQModifier:
       sequential_update: false
+      ignore: ["lm_head"]
       config_groups:
         group_0:
           weights:
@@ -31,4 +32,4 @@ quantization_stage:
             type: "int"
             symmetric: true
             strategy: "channel"
-          targets: ["Linear"]
\ No newline at end of file
+          targets: ["Linear"]
diff --git a/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.0 b/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.0
new file mode 100644
index 0000000000000000000000000000000000000000..f50c1e1d35d72d428ca5c9089328bd1502ba3709
GIT binary patch
literal 40
rcmb1OfPlsI-b$Po)~jr}JL#sQ6mL>dVrHJ6YguYuiIvg5{v~_>=3Nb6

literal 0
HcmV?d00001

diff --git a/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.1 b/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.1
new file mode 100644
index 0000000000000000000000000000000000000000..e0a460d3b90a39b1e3c23fdfd23720d03f728d7a
GIT binary patch
literal 40
rcmb1OfPlsI-b$Q5E~sp|JL#sQ6mL>dVrHJ6YguYuiIvgUz^YCF?l=wd

literal 0
HcmV?d00001

diff --git a/examples/tensorboard/events.out.tfevents.1716221651.workstation-deployment-57c9d55774-9p6vq.47256.2 b/examples/tensorboard/events.out.tfevents.1716221651.workstation-deployment-57c9d55774-9p6vq.47256.2
new file mode 100644
index 0000000000000000000000000000000000000000..01b3f2ebae03ef3ab2d990f314dbaae42834885c
GIT binary patch
literal 40
rcmb1OfPlsI-b$QH1Mh6PJL#sQ6mL>dVrHJ6YguYuiItK1b-VWf=+O

Date: Tue, 28 May 2024 13:36:13 +0000
Subject: [PATCH 7/8] remove extra files

---
 ...workstation-deployment-57c9d55774-9p6vq.47256.0 | Bin 40 -> 0 bytes
 ...workstation-deployment-57c9d55774-9p6vq.47256.1 | Bin 40 -> 0 bytes
 ...workstation-deployment-57c9d55774-9p6vq.47256.2 | Bin 40 -> 0 bytes
 3 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.0
 delete mode 100644 examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.1
 delete mode 100644 examples/tensorboard/events.out.tfevents.1716221651.workstation-deployment-57c9d55774-9p6vq.47256.2

diff --git a/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.0 b/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.0
deleted file mode 100644
index f50c1e1d35d72d428ca5c9089328bd1502ba3709..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 40
rcmb1OfPlsI-b$Po)~jr}JL#sQ6mL>dVrHJ6YguYuiIvg5{v~_>=3Nb6

diff --git a/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.1 b/examples/tensorboard/events.out.tfevents.1716221648.workstation-deployment-57c9d55774-9p6vq.47256.1
deleted file mode 100644
index e0a460d3b90a39b1e3c23fdfd23720d03f728d7a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 40
rcmb1OfPlsI-b$Q5E~sp|JL#sQ6mL>dVrHJ6YguYuiIvgUz^YCF?l=wd

diff --git a/examples/tensorboard/events.out.tfevents.1716221651.workstation-deployment-57c9d55774-9p6vq.47256.2 b/examples/tensorboard/events.out.tfevents.1716221651.workstation-deployment-57c9d55774-9p6vq.47256.2
deleted file mode 100644
index 01b3f2ebae03ef3ab2d990f314dbaae42834885c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 40
rcmb1OfPlsI-b$QH1Mh6PJL#sQ6mL>dVrHJ6YguYuiItK1b-VWf=+O

Date: Tue, 28 May 2024 15:33:48 +0000
Subject: [PATCH 8/8] update sparse dtype

---
 examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py b/examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py
index f70bf20a947..fe454a0d7ad 100644
--- a/examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py
+++ b/examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py
@@ -24,12 +24,12 @@
 num_calibration_samples = 512

 # set training parameters for finetuning
-num_train_epochs = 1
+num_train_epochs = 0.5
 logging_steps = 500
 save_steps = 5000
 gradient_checkpointing = True  # saves memory during training
 learning_rate = 0.0001
-bf16 = True  # using bfloat16 for training
+bf16 = False  # using full precision for training
 lr_scheduler_type = "cosine"
 warmup_ratio = 0.1
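For readers following the series, the sketch below condenses the single-modifier flow these patches converge on: one GPTQModifier recipe applied through `oneshot`. It is a minimal illustration assembled from the examples in this PR, not an additional file in it; the model stub, output directory, and calibration settings simply mirror the llama7b_quantize_sparse_cnn.py example above and should be treated as placeholders.

```python
import torch
from datasets import load_dataset

from sparseml.transformers import (
    SparseAutoModelForCausalLM,
    SparseAutoTokenizer,
    oneshot,
)

# single-modifier GPTQ W4A16 recipe -- GPTQModifier replaces the old
# vLLMQuantizationModifier + SparseGPTModifier pair used before this PR
recipe = """
quant_stage:
    quant_modifiers:
        GPTQModifier:
            sequential_update: false
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 4
                        type: "int"
                        symmetric: true
                        strategy: "channel"
                    targets: ["Linear"]
"""

# placeholder stub taken from the sparse example above; substitute your own model
model_stub = "neuralmagic/SparseLlama-2-7b-cnn-daily-mail-pruned_50.2of4"
model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub, torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = SparseAutoTokenizer.from_pretrained(model_stub)

# small calibration split, tokenized the same way as the cnn_dailymail example
max_seq_length = 4096
num_calibration_samples = 1024
dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train[:5%]")
tokenized = dataset.map(
    lambda s: tokenizer(
        "Article:\n{}\n\n### Summarization:\n{}".format(s["article"], s["highlights"]),
        max_length=max_seq_length,
        truncation=True,
    ),
    remove_columns=["article", "highlights", "id"],
)

# one-shot GPTQ quantization; save_compressed writes the packed int4 weights
oneshot(
    model=model,
    dataset=tokenized,
    recipe=recipe,
    output_dir="./llama7b_w4a16_channel_compressed",  # placeholder output path
    max_seq_length=max_seq_length,
    pad_to_max_length=False,
    num_calibration_samples=num_calibration_samples,
    save_compressed=True,
)
```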