From 53e98b6a1eb42221a358e9a8019a745f66d565f1 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Tue, 28 May 2024 11:37:57 -0400
Subject: [PATCH 1/3] Fixing Multi-GPU Unit Test Issue (#2302)

* set device at model init

* quality
---
 .../finetune/test_oneshot_then_finetune.py | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tests/sparseml/transformers/finetune/test_oneshot_then_finetune.py b/tests/sparseml/transformers/finetune/test_oneshot_then_finetune.py
index eb010083343..ef7a1b30a5b 100644
--- a/tests/sparseml/transformers/finetune/test_oneshot_then_finetune.py
+++ b/tests/sparseml/transformers/finetune/test_oneshot_then_finetune.py
@@ -34,16 +34,13 @@ def setUp(self):
         self.output = Path("./finetune_output")

     def test_oneshot_then_finetune(self):
-        import torch
-
         import sparseml
-        from sparseml.transformers import oneshot, train
+        from sparseml.transformers import SparseAutoModelForCausalLM, oneshot, train

         recipe_str = "tests/sparseml/transformers/obcq/recipes/test_tiny2.yaml"
-        model = "Xenova/llama2.c-stories15M"
-        device = "cuda:0"
-        if not torch.cuda.is_available():
-            device = "cpu"
+        model = SparseAutoModelForCausalLM.from_pretrained(
+            "Xenova/llama2.c-stories15M", device_map="auto"
+        )
         dataset = "open_platypus"
         concatenate_data = False
         num_calibration_samples = 64
@@ -59,11 +56,15 @@ def test_oneshot_then_finetune(self):
                 recipe=recipe_str,
                 concatenate_data=concatenate_data,
                 splits=splits,
-                oneshot_device=device,
             )

         recipe_str = "tests/sparseml/transformers/finetune/test_finetune_recipe.yaml"
-        model = self.output / "oneshot_out"
+        model = SparseAutoModelForCausalLM.from_pretrained(
+            self.output / "oneshot_out", device_map="auto"
+        )
+        distill_teacher = SparseAutoModelForCausalLM.from_pretrained(
+            "Xenova/llama2.c-stories15M", device_map="auto"
+        )
         dataset = "open_platypus"
         concatenate_data = False
         output_dir = self.output / "finetune_out"
@@ -73,7 +74,7 @@ def test_oneshot_then_finetune(self):
         with sparseml.create_session():
             train(
                 model=model,
-                distill_teacher="Xenova/llama2.c-stories15M",
+                distill_teacher=distill_teacher,
                 dataset=dataset,
                 output_dir=output_dir,
                 num_calibration_samples=num_calibration_samples,
@@ -81,7 +82,6 @@ def test_oneshot_then_finetune(self):
                 concatenate_data=concatenate_data,
                 splits=splits,
                 max_steps=max_steps,
-                oneshot_device=device,
             )

     def tearDown(self):
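For reference, the pattern the updated test relies on is loading the model once with
device_map="auto", which spreads the weights across all visible GPUs (or places them
on CPU when none are available), instead of passing an explicit oneshot_device. A
minimal sketch, not part of the patch itself; the recipe path and output directory
below are placeholders:

    from sparseml.transformers import SparseAutoModelForCausalLM, oneshot

    # device_map="auto" places the layers across the available devices at load time,
    # so no oneshot_device argument is needed afterwards
    model = SparseAutoModelForCausalLM.from_pretrained(
        "Xenova/llama2.c-stories15M", device_map="auto"
    )
    oneshot(
        model=model,
        dataset="open_platypus",
        recipe="path/to/recipe.yaml",  # placeholder recipe path
        output_dir="./oneshot_out",  # placeholder output directory
    )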
From 56b785414a4609a2827395c017c134cf81f14d95 Mon Sep 17 00:00:00 2001
From: Sara Adkins
Date: Tue, 28 May 2024 12:25:05 -0400
Subject: [PATCH 2/3] Update Examples to New UX (#2301)

* update examples to use new ux

* add sparse model

* update paths

* update paths

* up samples for sparse model

* fix recipe

* remove extra files

* update sparse dtype

---------

Co-authored-by: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com>
---
 examples/llama7b_quantize_sparse_cnn.py    | 74 +++++++++++++++++++
 .../2:4_w4a16_recipe.yaml                  |  7 +-
 .../llama7b_sparse_w4a16.py                |  4 +-
 examples/llama7b_w4a16_quantization.ipynb  | 14 ++--
 examples/llama7b_w4a16_quantization.py     |  7 +-
 examples/llama7b_w8a8_quantization.py      |  9 +--
 6 files changed, 88 insertions(+), 27 deletions(-)
 create mode 100644 examples/llama7b_quantize_sparse_cnn.py

diff --git a/examples/llama7b_quantize_sparse_cnn.py b/examples/llama7b_quantize_sparse_cnn.py
new file mode 100644
index 00000000000..2ce3fb0ac6a
--- /dev/null
+++ b/examples/llama7b_quantize_sparse_cnn.py
@@ -0,0 +1,74 @@
+import torch
+from datasets import load_dataset
+
+from sparseml.transformers import (
+    SparseAutoModelForCausalLM,
+    SparseAutoTokenizer,
+    oneshot,
+)
+
+
+# define a sparseml recipe for GPTQ W4A16 quantization
+recipe = """
+quant_stage:
+    quant_modifiers:
+        GPTQModifier:
+            sequential_update: false
+            ignore: ["lm_head"]
+            config_groups:
+                group_0:
+                    weights:
+                        num_bits: 4
+                        type: "int"
+                        symmetric: true
+                        strategy: "channel"
+                    targets: ["Linear"]
+"""
+
+# load in a 50% sparse model with 2:4 sparsity structure
+# setting device_map to auto to spread the model evenly across all available GPUs
+model_stub = "neuralmagic/SparseLlama-2-7b-cnn-daily-mail-pruned_50.2of4"
+model = SparseAutoModelForCausalLM.from_pretrained(
+    model_stub, torch_dtype=torch.bfloat16, device_map="auto"
+)
+tokenizer = SparseAutoTokenizer.from_pretrained(model_stub)
+
+# for quantization calibration, we will use a subset of the dataset that was used to
+# sparsify and finetune the model
+dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train[:5%]")
+
+# set dataset config parameters
+max_seq_length = 4096
+pad_to_max_length = False
+num_calibration_samples = 1024
+
+
+# preprocess the data into a single text entry, then tokenize the dataset
+def process_sample(sample):
+    formatted = "Article:\n{}\n\n### Summarization:\n{}".format(
+        sample["article"], sample["highlights"]
+    )
+    return tokenizer(
+        formatted, padding=pad_to_max_length, max_length=max_seq_length, truncation=True
+    )
+
+
+tokenized_dataset = dataset.map(
+    process_sample, remove_columns=["article", "highlights", "id"]
+)
+
+# save location of quantized model out
+output_dir = "./llama7b_sparse_24_w4a16_channel_compressed"
+
+# apply quantization recipe to the model and save quantized output int4 packed format
+# the sparsity structure of the original model will be maintained
+oneshot(
+    model=model,
+    dataset=tokenized_dataset,
+    recipe=recipe,
+    output_dir=output_dir,
+    max_seq_length=max_seq_length,
+    pad_to_max_length=pad_to_max_length,
+    num_calibration_samples=num_calibration_samples,
+    save_compressed=True,
+)
diff --git a/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml b/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
index 9969e5d77ce..1c4d2a09802 100644
--- a/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
+++ b/examples/llama7b_sparse_quantized/2:4_w4a16_recipe.yaml
@@ -22,7 +22,8 @@ finetuning_stage:
 quantization_stage:
   run_type: oneshot
   quantization_modifiers:
-    vLLMQuantizationModifier:
+    GPTQModifier:
+      sequential_update: false
       ignore: ["lm_head"]
       config_groups:
         group_0:
@@ -32,7 +33,3 @@ quantization_stage:
            symmetric: true
            strategy: "channel"
          targets: ["Linear"]
-  SparseGPTModifier:
-    sparsity: 0.0
-    quantize: True
-    sequential_update: false
\ No newline at end of file
diff --git a/examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py b/examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py
index f70bf20a947..fe454a0d7ad 100644
--- a/examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py
+++ b/examples/llama7b_sparse_quantized/llama7b_sparse_w4a16.py
@@ -24,12 +24,12 @@
 num_calibration_samples = 512

 # set training parameters for finetuning
-num_train_epochs = 1
+num_train_epochs = 0.5
 logging_steps = 500
 save_steps = 5000
 gradient_checkpointing = True  # saves memory during training
 learning_rate = 0.0001
-bf16 = True  # using bfloat16 for training
+bf16 = False  # using full precision for training
 lr_scheduler_type = "cosine"
 warmup_ratio = 0.1

diff --git a/examples/llama7b_w4a16_quantization.ipynb b/examples/llama7b_w4a16_quantization.ipynb
index ad1ee7af8ce..194215891fa 100644
--- a/examples/llama7b_w4a16_quantization.ipynb
+++ b/examples/llama7b_w4a16_quantization.ipynb
@@ -25,10 +25,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "SparseML uses recipes to define configurations for different oneshot algorithms. Recipes can be defined as a string or a yaml file. Below we create a sample recipe for GPTQ quantization. The recipe is made up of two different algorithms, called modifiers.\n",
+    "SparseML uses recipes to define configurations for different oneshot algorithms. Recipes can be defined as a string or a yaml file. A recipe consists of one or more sparsification or quantization algorithms, called modifiers in SparseML. Below we create a sample recipe for GPTQ quantization that only requires a single modifier.\n",
     "\n",
-    "1. **vLLMQuantizationModifier**: calibrates the model for quantization by calculating scale and zero points from a small amount of calibration data\n",
-    "2. **SparseGPTModifier**: applies the GPTQ algorithm, using the result of the vLLMQuantizationModifier to determine the best quantization bin to place each linear weight into"
+    "This modifier specifies that we should quantize the weights of each linear layer to 4 bits, using a symmetric channelwise quantization pattern. The lm-head will not be quantized even though it is a Linear layer, because it is included in the ignore list."
    ]
   },
   {
@@ -37,10 +36,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "recipe=\"\"\"\n",
+    "recipe = \"\"\"\n",
     "quant_stage:\n",
     "    quant_modifiers:\n",
-    "        vLLMQuantizationModifier:\n",
+    "        GPTQModifier:\n",
+    "            sequential_update: false\n",
     "            ignore: [\"lm_head\"]\n",
     "            config_groups:\n",
     "                group_0:\n",
@@ -50,10 +50,6 @@
     "                    symmetric: true\n",
     "                    strategy: \"channel\"\n",
     "                targets: [\"Linear\"]\n",
-    "        SparseGPTModifier:\n",
-    "            sparsity: 0.0\n",
-    "            quantize: True\n",
-    "            sequential_update: false\n",
     "\"\"\""
    ]
   },
diff --git a/examples/llama7b_w4a16_quantization.py b/examples/llama7b_w4a16_quantization.py
index 5aabf496436..a4a5f6bbb53 100644
--- a/examples/llama7b_w4a16_quantization.py
+++ b/examples/llama7b_w4a16_quantization.py
@@ -7,7 +7,8 @@
 recipe = """
 quant_stage:
     quant_modifiers:
-        vLLMQuantizationModifier:
+        GPTQModifier:
+            sequential_update: false
             ignore: ["lm_head"]
             config_groups:
                 group_0:
@@ -17,10 +18,6 @@
                         symmetric: true
                         strategy: "channel"
                     targets: ["Linear"]
-        SparseGPTModifier:
-            sparsity: 0.0
-            quantize: true
-            sequential_update: false
 """

 # setting device_map to auto to spread the model evenly across all available GPUs
diff --git a/examples/llama7b_w8a8_quantization.py b/examples/llama7b_w8a8_quantization.py
index 5f70a2f1ae7..c894613ffbb 100644
--- a/examples/llama7b_w8a8_quantization.py
+++ b/examples/llama7b_w8a8_quantization.py
@@ -7,7 +7,8 @@
 recipe = """
 quant_stage:
     quant_modifiers:
-        vLLMQuantizationModifier:
+        GPTQModifier:
+            sequential_update: false
             ignore: ["lm_head"]
             config_groups:
                 group_0:
@@ -23,10 +24,6 @@
                         dynamic: True
                         strategy: "token"
                     targets: ["Linear"]
-        SparseGPTModifier:
-            sparsity: 0.0
-            quantize: true
-            sequential_update: false
 """

 # setting device_map to auto to spread the model evenly across all available GPUs
@@ -40,7 +37,7 @@
 dataset = "ultrachat-200k"

 # save location of quantized model out
-output_dir = "./output_llama7b_w8a8_channel_compressed"
+output_dir = "./output_llama7b_w8a8_channel_dynamic_compressed"

 # set dataset config parameters
 splits = {"calibration": "train_gen[:5%]"}
From 451d838968dc8bf8f105deeec94d6d25f2801cfb Mon Sep 17 00:00:00 2001
From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com>
Date: Tue, 28 May 2024 20:24:02 +0200
Subject: [PATCH 3/3] [Fix] Fully functional FSDP one-shot process (#2305)

* Update tests; diff updated on compressed tensors side

* Style

* Initial commit

* fix the FSDP name stripping

* cleanup after rebase

* refactoring

---------

Co-authored-by: Rahul Tuli
Co-authored-by: bogunowicz@arrival.com
---
 src/sparseml/modifiers/quantization/gptq/pytorch.py |  2 ++
 src/sparseml/pytorch/utils/sparsification.py        |  4 ++++
 src/sparseml/utils/fsdp/context.py                  | 10 +++++++---
 src/sparseml/utils/pytorch/module.py                |  1 -
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/sparseml/modifiers/quantization/gptq/pytorch.py b/src/sparseml/modifiers/quantization/gptq/pytorch.py
index 6f1c9f40bbd..2eb14e8d2d7 100644
--- a/src/sparseml/modifiers/quantization/gptq/pytorch.py
+++ b/src/sparseml/modifiers/quantization/gptq/pytorch.py
@@ -23,6 +23,7 @@
 from sparseml.modifiers.quantization.gptq.utils.gptq_wrapper import GPTQWrapper
 from sparseml.modifiers.utils.layer_compressor import LayerCompressor
 from sparseml.modifiers.utils.pytorch_helpers import run_calibration_forward
+from sparseml.utils.fsdp.context import fix_fsdp_module_name


 __all__ = ["GPTQModifierPyTorch"]
@@ -116,6 +117,7 @@ def initialize_compression(
         self.layer_compressors_ = []

         for idx, (name, layer) in enumerate(self.compressible_layers_.items()):
+            name = fix_fsdp_module_name(name)
             _LOGGER.info(f"Preparing {name} for compression")
             args = self._pruning_arguments()
             comp_cls = self._compression_class()
diff --git a/src/sparseml/pytorch/utils/sparsification.py b/src/sparseml/pytorch/utils/sparsification.py
index f22750c85c6..9542c730a0b 100644
--- a/src/sparseml/pytorch/utils/sparsification.py
+++ b/src/sparseml/pytorch/utils/sparsification.py
@@ -69,6 +69,10 @@ def __init__(
         self.state_dict = state_dict

         if self.state_dict is not None:
+            # when analyzing an FSDP model, the state_dict does not differentiate
+            # between trainable and non-trainable parameters
+            # (e.g. it can contain buffers) this means that the
+            # self.trainable_parameters may be overestimated
             self.trainable_params = [param for _, param in state_dict.items()]
         else:
             self.trainable_params = list(
diff --git a/src/sparseml/utils/fsdp/context.py b/src/sparseml/utils/fsdp/context.py
index d6a3063f05c..6d9470e20a2 100644
--- a/src/sparseml/utils/fsdp/context.py
+++ b/src/sparseml/utils/fsdp/context.py
@@ -30,7 +30,7 @@
     "fix_fsdp_module_name",
 ]

-FSDP_WRAPPER_NAME = "_fsdp_wrapped_module."
+FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"


 def summon_full_params_context(model, offload_to_cpu: bool = False):
@@ -61,9 +61,13 @@ def main_process_first_context():

 def fix_fsdp_module_name(name: str) -> str:
     """
-    Remove FSDP wrapper prefixes from a module name
+    Remove FSDP wrapper prefixes from a module name.
+    Accounts for scenario where FSDP_WRAPPER_NAME is
+    at the end of the name, as well as in the middle.

     :param name: name to strip
     :return: stripped name
     """
-    return name.replace(FSDP_WRAPPER_NAME, "")
+    return name.replace(FSDP_WRAPPER_NAME + ".", "").replace(
+        "." + FSDP_WRAPPER_NAME, ""
+    )
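For reference, a minimal sketch of the stripping behavior the updated helper aims for,
not part of the patch; the module names below are hypothetical examples:

    FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"

    def fix_fsdp_module_name(name: str) -> str:
        # drop the wrapper segment whether it appears at the start, in the middle,
        # or as the final segment of a dotted module name
        return name.replace(FSDP_WRAPPER_NAME + ".", "").replace(
            "." + FSDP_WRAPPER_NAME, ""
        )

    assert fix_fsdp_module_name("_fsdp_wrapped_module.model.layers.0") == "model.layers.0"
    assert fix_fsdp_module_name("model._fsdp_wrapped_module.layers.0") == "model.layers.0"
    assert fix_fsdp_module_name("model.layers.0._fsdp_wrapped_module") == "model.layers.0"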
diff --git a/src/sparseml/utils/pytorch/module.py b/src/sparseml/utils/pytorch/module.py
index 780f1255db1..437a2e723f2 100644
--- a/src/sparseml/utils/pytorch/module.py
+++ b/src/sparseml/utils/pytorch/module.py
@@ -188,7 +188,6 @@ def get_layer(target: str, module: Module) -> Tuple[str, Module]:


 def set_layer(target: str, layer: Module, module: Module) -> Module:
-    target = fix_fsdp_module_name(target)
     with summon_full_params_context(module):
         # importing here to avoid circular import
         from sparseml.utils.fsdp.helpers import maybe_get_wrapped