neuralmagic · bfineran · Apr 10, 2024 · Apr 5, 2024 · Apr 5, 2024 · Apr 9, 2024
diff --git a/...ormers/obcq/test_additional_sparsity.yaml → ...ransformers/obcq/additional_sparsity.yaml b/...ormers/obcq/test_additional_sparsity.yaml → ...ransformers/obcq/additional_sparsity.yaml
diff --git a/tests/sparseml/transformers/obcq/quant.yaml b/tests/sparseml/transformers/obcq/quant.yaml
@@ -0,0 +1,41 @@
+test_stage:
+  obcq_modifiers:
+    SmoothQuantModifier:
+      smoothing_strength: 0.5
+      mappings: [
+        [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
+        [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"]
+      ]
+    QuantizationModifier:
+      ignore:
+        - LlamaRotaryEmbedding
+        - LlamaRMSNorm
+        - SiLU
+        - model.layers.0.mlp.down_proj
+        - model.layers.1.mlp.down_proj
+        - model.layers.2.mlp.down_proj
+        - model.layers.3.mlp.down_proj
+        - model.layers.4.mlp.down_proj
+        - model.layers.5.mlp.down_proj
+      scheme_overrides:
+        Embedding:
+          input_activations: null
+          weights:
+            num_bits: 8
+            symmetric: False
+    SparseGPTModifier:
+      sparsity: 0.0
+      block_size: 128
+      sequential_update: False
+      quantize: True
+      percdamp: 0.01
+      prunen: 0
+      prunem: 0
+      targets: [
+        "model.layers.0",
+        "model.layers.1",
+        "model.layers.2",
+        "model.layers.3",
+        "model.layers.4",
+        "model.layers.5"
+      ]
diff --git a/...sparseml/transformers/obcq/test_tiny.yaml → ...l/transformers/obcq/quant_and_sparse.yaml b/...sparseml/transformers/obcq/test_tiny.yaml → ...l/transformers/obcq/quant_and_sparse.yaml
diff --git a/...l/transformers/obcq/test_tiny_w_head.yaml → tests/sparseml/transformers/obcq/sparse.yaml b/...l/transformers/obcq/test_tiny_w_head.yaml → tests/sparseml/transformers/obcq/sparse.yaml
diff --git a/tests/sparseml/transformers/obcq/test_obcq.py b/tests/sparseml/transformers/obcq/test_obcq.py
@@ -16,96 +16,48 @@
 
 import pytest
 import torch
-from transformers import AutoTokenizer
 
 from sparseml.core import ModifiableModel
 from sparseml.core.framework import Framework
 from sparseml.core.state import State
 from sparseml.modifiers.obcq import SparseGPTModifier
 from sparseml.modifiers.obcq.pytorch import SparseGPTModifierPyTorch
-from sparseml.modifiers.obcq.utils.helpers import ppl_eval_general
+from sparseml.pytorch.model_load.helpers import get_session_model
 from sparseml.pytorch.utils.helpers import tensor_sparsity
-from sparseml.transformers import SparseAutoModelForCausalLM
-from sparseml.transformers.finetune.data import TextGenerationDataset
-from sparseml.transformers.finetune.data.data_args import DataTrainingArguments
-from sparseml.transformers.finetune.data.data_helpers import format_calibration_data
-from sparseml.transformers.sparsification.obcq.obcq import one_shot
-from sparseml.transformers.sparsification.obcq.utils.helpers import llama_forward
-from sparseml.transformers.utils.helpers import resolve_sequence_length
-from sparseml.transformers.utils.initializers import (
-    initialize_config,
-    initialize_sparse_model,
-)
+from sparseml.transformers import SparseAutoModelForCausalLM, oneshot
 
 
 @pytest.mark.parametrize(
     "recipe_file_path",
     [
-        "tests/sparseml/transformers/obcq/test_tiny.yaml",
-        "tests/sparseml/transformers/obcq/test_tiny2.yaml",
-        "tests/sparseml/transformers/obcq/test_tiny_w_head.yaml",
+        "tests/sparseml/transformers/obcq/sparse.yaml",
+        "tests/sparseml/transformers/obcq/quant.yaml",
+        "tests/sparseml/transformers/obcq/quant_and_sparse.yaml",
     ],
 )
 def test_obcq_tinystories(recipe_file_path):
     tiny_model_path = "Xenova/llama2.c-stories15M"
     device = "cuda:0"
-    num_samples = 64
-    dataset = "open_platypus"
     if not torch.cuda.is_available():
         device = "cpu"
-    config = initialize_config(model_path=tiny_model_path)
 
-    # test recipe with 50% sparsity, quantization and smoothquant
-    tiny_model = one_shot(
-        model_path=tiny_model_path,
-        dataset=dataset,
-        num_samples=num_samples,
-        device=device,
-        recipe_file=recipe_file_path,
-    )
-
-    data_args = DataTrainingArguments(
-        dataset=dataset,
-        max_seq_length=resolve_sequence_length(config),
-        num_calibration_samples=num_samples,
-        concatenate_data=False,
+    oneshot(
+        model=tiny_model_path,
+        dataset="open_platypus",
+        oneshot_device=device,
+        recipe=recipe_file_path,
+        max_seq_length=128,
+        num_calibration_samples=64,
         pad_to_max_length=False,
     )
 
-    tokenizer = AutoTokenizer.from_pretrained(
-        tiny_model_path, use_fast=True, trust_remote_code=True
-    )
-    dataset_manager = TextGenerationDataset.load_from_registry(
-        dataset, data_args=data_args, split="train", tokenizer=tokenizer
-    )
-    raw_dataset = dataset_manager.get_raw_dataset()
-    tokenized_dataset = dataset_manager.tokenize_and_process(raw_dataset)
-    test_data = format_calibration_data(
-        tokenized_dataset=tokenized_dataset, num_calibration_samples=num_samples
-    )
-    test_data = [d["input_ids"] for d in test_data]
-    perplexity = ppl_eval_general(
-        llama_forward, tiny_model, test_data, device, max_samples_per_iteration=8
-    )
-
-    # we aren't expecting good results from this tiny model, but this should catch any
-    # egregious errors with the OBCQ algorithm
-    assert perplexity < 10000.0
-
 
 def test_lm_head_target():
     tiny_model_path = "Xenova/llama2.c-stories15M"
     device = "cuda:0"
     if not torch.cuda.is_available():
         device = "cpu"
-
-    config = initialize_config(model_path=tiny_model_path)
-    model = initialize_sparse_model(
-        model_path=tiny_model_path,
-        device=device,
-        task="text-generation",
-        config=config,
-    )
+    model = SparseAutoModelForCausalLM.from_pretrained(tiny_model_path)
 
     kwargs = {
         "sparsity": 0.5,
@@ -140,25 +92,30 @@ def test_lm_head_target():
 
 def test_sparsities():
     tiny_model_path = "Xenova/llama2.c-stories15M"
-    lm_head_recipe = "tests/sparseml/transformers/obcq/test_tiny_w_head.yaml"
+    recipe = "tests/sparseml/transformers/obcq/sparse.yaml"
     device = "cuda:0"
     if not torch.cuda.is_available():
         device = "cpu"
 
-    # test recipe with 50% sparsity, quantization and smoothquant
+    # run oneshot with the sparse recipe; per-layer sparsities are checked below
-    tiny_model = one_shot(
-        model_path=tiny_model_path,
+    oneshot(
+        model=tiny_model_path,
         dataset="open_platypus",
-        num_samples=64,
-        device=device,
-        recipe_file=lm_head_recipe,
+        oneshot_device=device,
+        recipe=recipe,
+        max_seq_length=128,
+        num_calibration_samples=64,
+        pad_to_max_length=False,
+        clear_sparse_session=False,
     )
 
-    lm_head_sparsity = tensor_sparsity(tiny_model.lm_head.weight)
+    model = get_session_model()
+
+    lm_head_sparsity = tensor_sparsity(model.lm_head.weight)
     assert math.isclose(lm_head_sparsity.item(), 0.3, rel_tol=1e-4)
-    layer_1_sparse = tensor_sparsity(tiny_model.model.layers[1].self_attn.k_proj.weight)
+    layer_1_sparse = tensor_sparsity(model.model.layers[1].self_attn.k_proj.weight)
     assert math.isclose(layer_1_sparse.item(), 0.3, rel_tol=1e-4)
-    layer_2_dense = tensor_sparsity(tiny_model.model.layers[2].self_attn.k_proj.weight)
+    layer_2_dense = tensor_sparsity(model.model.layers[2].self_attn.k_proj.weight)
     assert math.isclose(layer_2_dense.item(), 0.0, rel_tol=1e-4)
 
 

diff --git a/tests/sparseml/transformers/obcq/test_repeats.py b/tests/sparseml/transformers/obcq/test_repeats.py
@@ -19,8 +19,9 @@
 import yaml
 
 import sparseml.core.session as session_manager
+from sparseml.pytorch.model_load.helpers import get_session_model
 from sparseml.pytorch.utils.helpers import tensor_sparsity
-from sparseml.transformers.sparsification.obcq.obcq import one_shot
+from sparseml.transformers import oneshot
 from sparseml.utils.pytorch import qat_active
 
 
@@ -32,22 +33,23 @@
 
 def test_consecutive_runs(tmp_path):
     tiny_model_path = "Xenova/llama2.c-stories15M"
-    first_recipe = "tests/sparseml/transformers/obcq/test_tiny.yaml"
-    second_recipe = "tests/sparseml/transformers/obcq/test_additional_sparsity.yaml"
+    first_recipe = "tests/sparseml/transformers/obcq/quant_and_sparse.yaml"
+    second_recipe = "tests/sparseml/transformers/obcq/additional_sparsity.yaml"
     device = "cuda:0"
     if not torch.cuda.is_available():
         device = "cpu"
 
     # test recipe with 50% sparsity, quantization and smoothquant
-    first_tiny_model = one_shot(
-        model_path=tiny_model_path,
+    oneshot(
+        model=tiny_model_path,
         dataset="open_platypus",
-        num_samples=16,
-        device=device,
-        recipe_file=first_recipe,
-        deploy_dir=tmp_path / "test1",
-        do_save=True,
+        num_calibration_samples=16,
+        recipe=first_recipe,
+        output_dir=tmp_path / "test1",
+        oneshot_device=device,
+        clear_sparse_session=False,
     )
+    first_tiny_model = get_session_model()
     layer_0_sparse = tensor_sparsity(
         first_tiny_model.model.layers[0].self_attn.k_proj.module.weight
     )
@@ -61,15 +63,17 @@ def test_consecutive_runs(tmp_path):
     session.reset()
 
     # reload saved model and up sparsity to 0.7
-    second_tiny_model = one_shot(
-        model_path=tmp_path / "test1" / "obcq_deployment",
+    oneshot(
+        model=tmp_path / "test1",
         dataset="open_platypus",
-        num_samples=16,
-        device=device,
-        recipe_file=second_recipe,
-        deploy_dir=tmp_path / "test2",
-        do_save=True,
+        num_calibration_samples=16,
+        recipe=second_recipe,
+        output_dir=tmp_path / "test2",
+        oneshot_device=device,
+        clear_sparse_session=False,
     )
+
+    second_tiny_model = get_session_model()
     layer_0_sparse = tensor_sparsity(
         second_tiny_model.model.layers[0].self_attn.k_proj.module.weight
     )
@@ -81,7 +85,7 @@ def test_consecutive_runs(tmp_path):
     stages = [stage.group for stage in session_recipe.stages]
     assert len(stages) == 2
 
-    recipe_path = tmp_path / "test2" / "obcq_deployment" / "recipe.yaml"
+    recipe_path = tmp_path / "test2" / "recipe.yaml"
     recipe_data = yaml.safe_load(recipe_path.read_text())
     stage_keys = recipe_data.keys()
     assert len(stage_keys) == 2
@@ -119,14 +123,14 @@ def test_fail_on_repeated_quant(tmp_path):
     if not torch.cuda.is_available():
         device = "cpu"
 
-    one_shot(
-        model_path=tiny_model_path,
+    oneshot(
+        model=tiny_model_path,
         dataset="open_platypus",
-        num_samples=4,
-        device=device,
-        recipe_file=first_recipe_str,
-        deploy_dir=tmp_path,
-        do_save=True,
+        num_calibration_samples=4,
+        oneshot_device=device,
+        recipe=first_recipe_str,
+        output_dir=tmp_path / "test",
+        clear_sparse_session=False,
     )
 
     session = session_manager.active_session()
@@ -135,12 +139,12 @@ def test_fail_on_repeated_quant(tmp_path):
     # When trying to re-quantize with the second recipe, we should error out
     # to avoid nested quantizations
     with pytest.raises(RuntimeError):
-        one_shot(
-            model_path=tmp_path / "obcq_deployment",
+        oneshot(
+            model=tmp_path / "test",
             dataset="open_platypus",
-            num_samples=4,
-            device=device,
-            recipe_file=second_recipe_str,
+            num_calibration_samples=4,
+            oneshot_device=device,
+            recipe=second_recipe_str,
         )
 
 
@@ -182,17 +186,17 @@ def test_separate_quants_allowed(tmp_path):
     if not torch.cuda.is_available():
         device = "cpu"
 
-    first_model = one_shot(
-        model_path=tiny_model_path,
+    oneshot(
+        model=tiny_model_path,
         dataset="open_platypus",
-        num_samples=4,
-        device=device,
-        recipe_file=first_recipe_str,
-        deploy_dir=tmp_path,
-        do_save=True,
+        num_calibration_samples=16,
+        recipe=first_recipe_str,
+        output_dir=tmp_path / "test1",
+        oneshot_device=device,
+        clear_sparse_session=False,
     )
-
     # only embedding quantized after first recipe
+    first_model = get_session_model()
     assert not isinstance(
         first_model.model.layers[0].mlp.down_proj, torch_quantization.QuantWrapper
     )
@@ -202,14 +206,17 @@ def test_separate_quants_allowed(tmp_path):
 
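-    # When trying to re-quantize with the second recipe, we should error out
-    # to avoid nested quantizations
+    # The second recipe targets modules the first run left unquantized, so this
+    # run is expected to succeed rather than error out on nested quantization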
-    second_model = one_shot(
-        model_path=tmp_path / "obcq_deployment",
+    oneshot(
+        model=tmp_path / "test1",
         dataset="open_platypus",
-        num_samples=4,
-        device=device,
-        recipe_file=second_recipe_str,
+        num_calibration_samples=16,
+        recipe=second_recipe_str,
+        output_dir=tmp_path / "test2",
+        oneshot_device=device,
+        clear_sparse_session=False,
     )
 
+    second_model = get_session_model()
     # linear and embeddings should be quantized now
     assert isinstance(
         second_model.model.layers[0].mlp.down_proj, torch_quantization.QuantWrapper