diff --git a/tests/sparseml/transformers/finetune/test_alternate_recipe.yaml b/tests/sparseml/transformers/finetune/test_alternate_recipe.yaml
index f49f56351d3..411d6a41fed 100644
--- a/tests/sparseml/transformers/finetune/test_alternate_recipe.yaml
+++ b/tests/sparseml/transformers/finetune/test_alternate_recipe.yaml
@@ -6,8 +6,7 @@ test_oneshot_stage:
       sequential_update: False
       quantize: False
       percdamp: 0.01
-      prunen: 0
-      prunem: 0
+      mask_structure: "0:0"
       targets: [
         "model.layers.0"
       ]
diff --git a/tests/sparseml/transformers/obcq/test_additional_sparsity.yaml b/tests/sparseml/transformers/obcq/additional_sparsity.yaml
similarity index 89%
rename from tests/sparseml/transformers/obcq/test_additional_sparsity.yaml
rename to tests/sparseml/transformers/obcq/additional_sparsity.yaml
index 4615625675f..19d479e8666 100644
--- a/tests/sparseml/transformers/obcq/test_additional_sparsity.yaml
+++ b/tests/sparseml/transformers/obcq/additional_sparsity.yaml
@@ -6,8 +6,7 @@ test_stage:
       sequential_update: True
       quantize: False
       percdamp: 0.01
-      prunen: 0
-      prunem: 0
+      mask_structure: "0:0"
       targets: [
         "model.layers.0"
       ]
diff --git a/tests/sparseml/transformers/obcq/quant.yaml b/tests/sparseml/transformers/obcq/quant.yaml
new file mode 100644
index 00000000000..d229cba2923
--- /dev/null
+++ b/tests/sparseml/transformers/obcq/quant.yaml
@@ -0,0 +1,40 @@
+test_stage:
+  obcq_modifiers:
+    SmoothQuantModifier:
+      smoothing_strength: 0.5
+      mappings: [
+        [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
+        [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"]
+      ]
+    QuantizationModifier:
+      ignore:
+        - LlamaRotaryEmbedding
+        - LlamaRMSNorm
+        - SiLU
+        - model.layers.0.mlp.down_proj
+        - model.layers.1.mlp.down_proj
+        - model.layers.2.mlp.down_proj
+        - model.layers.3.mlp.down_proj
+        - model.layers.4.mlp.down_proj
+        - model.layers.5.mlp.down_proj
+      scheme_overrides:
+        Embedding:
+          input_activations: null
+          weights:
+            num_bits: 8
+            symmetric: False
+    SparseGPTModifier:
+      sparsity: 0.0
+      block_size: 128
+      sequential_update: False
+      quantize: True
+      percdamp: 0.01
+      mask_structure: "0:0"
+      targets: [
+        "model.layers.0",
+        "model.layers.1",
+        "model.layers.2",
+        "model.layers.3",
+        "model.layers.4",
+        "model.layers.5"
+      ]
\ No newline at end of file
diff --git a/tests/sparseml/transformers/obcq/test_tiny.yaml b/tests/sparseml/transformers/obcq/quant_and_sparse.yaml
similarity index 97%
rename from tests/sparseml/transformers/obcq/test_tiny.yaml
rename to tests/sparseml/transformers/obcq/quant_and_sparse.yaml
index 422baf87580..ddaf20b854f 100644
--- a/tests/sparseml/transformers/obcq/test_tiny.yaml
+++ b/tests/sparseml/transformers/obcq/quant_and_sparse.yaml
@@ -30,8 +30,7 @@ test_stage:
       sequential_update: False
       quantize: True
       percdamp: 0.01
-      prunen: 0
-      prunem: 0
+      mask_structure: "0:0"
       targets: [
         "model.layers.0",
         "model.layers.1",
diff --git a/tests/sparseml/transformers/obcq/test_tiny_w_head.yaml b/tests/sparseml/transformers/obcq/sparse.yaml
similarity index 89%
rename from tests/sparseml/transformers/obcq/test_tiny_w_head.yaml
rename to tests/sparseml/transformers/obcq/sparse.yaml
index a5debe7e25e..3b03ff95f7e 100644
--- a/tests/sparseml/transformers/obcq/test_tiny_w_head.yaml
+++ b/tests/sparseml/transformers/obcq/sparse.yaml
@@ -6,8 +6,7 @@ test_stage:
       sequential_update: False
       quantize: False
       percdamp: 0.01
-      prunen: 0
-      prunem: 0
+      mask_structure: "0:0"
       targets: [
         "model.layers.0",
         "model.layers.1",
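Note on the recipe change above: the paired integers prunen/prunem are folded into a single mask_structure "N:M" string, where "0:0" requests unstructured pruning and a value such as "2:4" would request pruning 2 of every 4 weights. A minimal sketch of the same settings applied programmatically, assuming SparseGPTModifier accepts the YAML fields as keyword arguments (the test diff below builds it from a kwargs dict in the same way); the values here are illustrative:

    from sparseml.modifiers.obcq import SparseGPTModifier

    # mask_structure supersedes the old prunen/prunem pair:
    # "0:0" means no N:M constraint, i.e. unstructured pruning.
    modifier = SparseGPTModifier(
        sparsity=0.5,
        block_size=128,
        sequential_update=False,
        quantize=False,
        percdamp=0.01,
        mask_structure="0:0",
        targets=["model.layers.0"],
    )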
diff --git a/tests/sparseml/transformers/obcq/test_obcq.py b/tests/sparseml/transformers/obcq/test_obcq.py
index 12cbb5e6240..f61ac1c2567 100644
--- a/tests/sparseml/transformers/obcq/test_obcq.py
+++ b/tests/sparseml/transformers/obcq/test_obcq.py
@@ -16,96 +16,48 @@
 import pytest
 import torch
-from transformers import AutoTokenizer

 from sparseml.core import ModifiableModel
 from sparseml.core.framework import Framework
 from sparseml.core.state import State
 from sparseml.modifiers.obcq import SparseGPTModifier
 from sparseml.modifiers.obcq.pytorch import SparseGPTModifierPyTorch
-from sparseml.modifiers.obcq.utils.helpers import ppl_eval_general
+from sparseml.pytorch.model_load.helpers import get_session_model
 from sparseml.pytorch.utils.helpers import tensor_sparsity
-from sparseml.transformers import SparseAutoModelForCausalLM
-from sparseml.transformers.finetune.data import TextGenerationDataset
-from sparseml.transformers.finetune.data.data_args import DataTrainingArguments
-from sparseml.transformers.finetune.data.data_helpers import format_calibration_data
-from sparseml.transformers.sparsification.obcq.obcq import one_shot
-from sparseml.transformers.sparsification.obcq.utils.helpers import llama_forward
-from sparseml.transformers.utils.helpers import resolve_sequence_length
-from sparseml.transformers.utils.initializers import (
-    initialize_config,
-    initialize_sparse_model,
-)
+from sparseml.transformers import SparseAutoModelForCausalLM, oneshot


 @pytest.mark.parametrize(
     "recipe_file_path",
     [
-        "tests/sparseml/transformers/obcq/test_tiny.yaml",
-        "tests/sparseml/transformers/obcq/test_tiny2.yaml",
-        "tests/sparseml/transformers/obcq/test_tiny_w_head.yaml",
+        "tests/sparseml/transformers/obcq/sparse.yaml",
+        "tests/sparseml/transformers/obcq/quant.yaml",
+        "tests/sparseml/transformers/obcq/quant_and_sparse.yaml",
     ],
 )
 def test_obcq_tinystories(recipe_file_path):
     tiny_model_path = "Xenova/llama2.c-stories15M"
     device = "cuda:0"
-    num_samples = 64
-    dataset = "open_platypus"
     if not torch.cuda.is_available():
         device = "cpu"

-    config = initialize_config(model_path=tiny_model_path)
-    # test recipe with 50% sparsity, quantization and smoothquant
-    tiny_model = one_shot(
-        model_path=tiny_model_path,
-        dataset=dataset,
-        num_samples=num_samples,
-        device=device,
-        recipe_file=recipe_file_path,
-    )
-
-    data_args = DataTrainingArguments(
-        dataset=dataset,
-        max_seq_length=resolve_sequence_length(config),
-        num_calibration_samples=num_samples,
-        concatenate_data=False,
+    oneshot(
+        model=tiny_model_path,
+        dataset="open_platypus",
+        oneshot_device=device,
+        recipe=recipe_file_path,
+        max_seq_length=128,
+        num_calibration_samples=64,
         pad_to_max_length=False,
     )

-    tokenizer = AutoTokenizer.from_pretrained(
-        tiny_model_path, use_fast=True, trust_remote_code=True
-    )
-    dataset_manager = TextGenerationDataset.load_from_registry(
-        dataset, data_args=data_args, split="train", tokenizer=tokenizer
-    )
-    raw_dataset = dataset_manager.get_raw_dataset()
-    tokenized_dataset = dataset_manager.tokenize_and_process(raw_dataset)
-    test_data = format_calibration_data(
-        tokenized_dataset=tokenized_dataset, num_calibration_samples=num_samples
-    )
-    test_data = [d["input_ids"] for d in test_data]
-    perplexity = ppl_eval_general(
-        llama_forward, tiny_model, test_data, device, max_samples_per_iteration=8
-    )
-
-    # we aren't expecting good results from this tiny model, but this should catch any
-    # egregious errors with the OBCQ algorithm
-    assert perplexity < 10000.0
-

 def test_lm_head_target():
     tiny_model_path = "Xenova/llama2.c-stories15M"
     device = "cuda:0"
     if not torch.cuda.is_available():
         device = "cpu"
-
-    config = initialize_config(model_path=tiny_model_path)
-    model = initialize_sparse_model(
-        model_path=tiny_model_path,
-        device=device,
-        task="text-generation",
-        config=config,
-    )
+    model = SparseAutoModelForCausalLM.from_pretrained(tiny_model_path)

     kwargs = {
         "sparsity": 0.5,
@@ -140,25 +92,30 @@ def test_lm_head_target():

 def test_sparsities():
     tiny_model_path = "Xenova/llama2.c-stories15M"
-    lm_head_recipe = "tests/sparseml/transformers/obcq/test_tiny_w_head.yaml"
+    recipe = "tests/sparseml/transformers/obcq/sparse.yaml"
     device = "cuda:0"
     if not torch.cuda.is_available():
         device = "cpu"

     # test recipe with 50% sparsity, quantization and smoothquant
-    tiny_model = one_shot(
-        model_path=tiny_model_path,
+    oneshot(
+        model=tiny_model_path,
         dataset="open_platypus",
-        num_samples=64,
-        device=device,
-        recipe_file=lm_head_recipe,
+        oneshot_device=device,
+        recipe=recipe,
+        max_seq_length=128,
+        num_calibration_samples=64,
+        pad_to_max_length=False,
+        clear_sparse_session=False,
     )

-    lm_head_sparsity = tensor_sparsity(tiny_model.lm_head.weight)
+    model = get_session_model()
+
+    lm_head_sparsity = tensor_sparsity(model.lm_head.weight)
     assert math.isclose(lm_head_sparsity.item(), 0.3, rel_tol=1e-4)
-    layer_1_sparse = tensor_sparsity(tiny_model.model.layers[1].self_attn.k_proj.weight)
+    layer_1_sparse = tensor_sparsity(model.model.layers[1].self_attn.k_proj.weight)
     assert math.isclose(layer_1_sparse.item(), 0.3, rel_tol=1e-4)
-    layer_2_dense = tensor_sparsity(tiny_model.model.layers[2].self_attn.k_proj.weight)
+    layer_2_dense = tensor_sparsity(model.model.layers[2].self_attn.k_proj.weight)
     assert math.isclose(layer_2_dense.item(), 0.0, rel_tol=1e-4)
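The test rewrite above drops the manual config/tokenizer/dataset plumbing and the old one_shot helper in favor of the single oneshot entrypoint, which runs calibration and compression in one call and leaves the result on the active session instead of returning it. A minimal sketch of that calling pattern, with the argument names copied from the updated tests (values illustrative):

    from sparseml.pytorch.model_load.helpers import get_session_model
    from sparseml.transformers import oneshot

    # One-shot sparsification driven entirely by the recipe file.
    oneshot(
        model="Xenova/llama2.c-stories15M",
        dataset="open_platypus",
        recipe="tests/sparseml/transformers/obcq/sparse.yaml",
        oneshot_device="cpu",
        max_seq_length=128,
        num_calibration_samples=64,
        pad_to_max_length=False,
        clear_sparse_session=False,  # keep the session alive so the model can be fetched
    )
    model = get_session_model()  # compressed model is read back off the active session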
model_path=tmp_path / "test1" / "obcq_deployment", + oneshot( + model=tmp_path / "test1", dataset="open_platypus", - num_samples=16, - device=device, - recipe_file=second_recipe, - deploy_dir=tmp_path / "test2", - do_save=True, + num_calibration_samples=16, + recipe=second_recipe, + output_dir=tmp_path / "test2", + oneshot_device=device, + clear_sparse_session=False, ) + + second_tiny_model = get_session_model() layer_0_sparse = tensor_sparsity( second_tiny_model.model.layers[0].self_attn.k_proj.module.weight ) @@ -81,7 +85,7 @@ def test_consecutive_runs(tmp_path): stages = [stage.group for stage in session_recipe.stages] assert len(stages) == 2 - recipe_path = tmp_path / "test2" / "obcq_deployment" / "recipe.yaml" + recipe_path = tmp_path / "test2" / "recipe.yaml" recipe_data = yaml.safe_load(recipe_path.read_text()) stage_keys = recipe_data.keys() assert len(stage_keys) == 2 @@ -119,14 +123,14 @@ def test_fail_on_repeated_quant(tmp_path): if not torch.cuda.is_available(): device = "cpu" - one_shot( - model_path=tiny_model_path, + oneshot( + model=tiny_model_path, dataset="open_platypus", - num_samples=4, - device=device, - recipe_file=first_recipe_str, - deploy_dir=tmp_path, - do_save=True, + num_calibration_samples=4, + oneshot_device=device, + recipe=first_recipe_str, + output_dir=tmp_path / "test", + clear_sparse_session=False, ) session = session_manager.active_session() @@ -135,12 +139,12 @@ def test_fail_on_repeated_quant(tmp_path): # When trying to re-quantize with the second recipe, we should error out # to avoid nested quantizations with pytest.raises(RuntimeError): - one_shot( - model_path=tmp_path / "obcq_deployment", + oneshot( + model=tmp_path / "test", dataset="open_platypus", - num_samples=4, - device=device, - recipe_file=second_recipe_str, + num_calibration_samples=4, + oneshot_device=device, + recipe=second_recipe_str, ) @@ -182,17 +186,17 @@ def test_separate_quants_allowed(tmp_path): if not torch.cuda.is_available(): device = "cpu" - first_model = one_shot( - model_path=tiny_model_path, + oneshot( + model=tiny_model_path, dataset="open_platypus", - num_samples=4, - device=device, - recipe_file=first_recipe_str, - deploy_dir=tmp_path, - do_save=True, + num_calibration_samples=16, + recipe=first_recipe_str, + output_dir=tmp_path / "test1", + oneshot_device=device, + clear_sparse_session=False, ) - # only embedding quantized after first recipe + first_model = get_session_model() assert not isinstance( first_model.model.layers[0].mlp.down_proj, torch_quantization.QuantWrapper ) @@ -202,14 +206,17 @@ def test_separate_quants_allowed(tmp_path): # When trying to re-quantize with the second recipe, we should error out # to avoid nested quantizations - second_model = one_shot( - model_path=tmp_path / "obcq_deployment", + oneshot( + model=tmp_path / "test1", dataset="open_platypus", - num_samples=4, - device=device, - recipe_file=second_recipe_str, + num_calibration_samples=16, + recipe=second_recipe_str, + output_dir=tmp_path / "test2", + oneshot_device=device, + clear_sparse_session=False, ) + second_model = get_session_model() # linear and embeddings should be quantized now assert isinstance( second_model.model.layers[0].mlp.down_proj, torch_quantization.QuantWrapper diff --git a/tests/sparseml/transformers/obcq/test_tiny2.yaml b/tests/sparseml/transformers/obcq/test_tiny2.yaml index ca3c9e8b4c9..f513b7e0c4f 100644 --- a/tests/sparseml/transformers/obcq/test_tiny2.yaml +++ b/tests/sparseml/transformers/obcq/test_tiny2.yaml @@ -6,8 +6,7 @@ test_stage: sequential_update: False 
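Two consequences of the migration show up in test_repeats.py above: output_dir replaces deploy_dir and writes directly to the given path (no obcq_deployment subfolder), and since oneshot returns nothing, each run's model is pulled from the session afterwards. A sketch of chaining two runs under those assumptions, with illustrative paths:

    from sparseml.transformers import oneshot

    # First run: prune and quantize, saving directly into output_dir.
    oneshot(
        model="Xenova/llama2.c-stories15M",
        dataset="open_platypus",
        recipe="tests/sparseml/transformers/obcq/quant_and_sparse.yaml",
        output_dir="runs/first",  # saved here directly, no "obcq_deployment" subdir
        num_calibration_samples=16,
        clear_sparse_session=False,
    )
    # Second run: reload straight from the prior output_dir and raise sparsity.
    oneshot(
        model="runs/first",
        dataset="open_platypus",
        recipe="tests/sparseml/transformers/obcq/additional_sparsity.yaml",
        output_dir="runs/second",
        num_calibration_samples=16,
        clear_sparse_session=False,
    )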
diff --git a/tests/sparseml/transformers/obcq/test_tiny2.yaml b/tests/sparseml/transformers/obcq/test_tiny2.yaml
index ca3c9e8b4c9..f513b7e0c4f 100644
--- a/tests/sparseml/transformers/obcq/test_tiny2.yaml
+++ b/tests/sparseml/transformers/obcq/test_tiny2.yaml
@@ -6,8 +6,7 @@ test_stage:
       sequential_update: False
       quantize: False
       percdamp: 0.01
-      prunen: 0
-      prunem: 0
+      mask_structure: "0:0"
       targets: [
         "model.layers.0",
         "model.layers.1",