add sparse model
Sara Adkins committed May 23, 2024
1 parent 2bd35e5 commit d4ea973
Showing 1 changed file with 74 additions and 0 deletions.
examples/llama7b_quantize_sparse_cnn.py (+74 −0)
@@ -0,0 +1,74 @@
import torch
from datasets import load_dataset

from sparseml.transformers import (
    SparseAutoModelForCausalLM,
    SparseAutoTokenizer,
    oneshot,
)


# define a sparseml recipe for GPTQ W4A16 quantization
recipe = """
quant_stage:
quant_modifiers:
GPTQModifier:
sequential_update: false
ignore: ["lm_head"]
config_groups:
group_0:
weights:
num_bits: 4
type: "int"
symmetric: true
strategy: "channel"
targets: ["Linear"]
"""

# load in a 50% sparse model with 2:4 sparsity structure
# setting device_map to auto to spread the model evenly across all available GPUs
model_stub = "neuralmagic/SparseLlama-2-7b-cnn-daily-mail-pruned_50.2of4"
model = SparseAutoModelForCausalLM.from_pretrained(
    model_stub, torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = SparseAutoTokenizer.from_pretrained(model_stub)
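# note: the 2:4 structure means two of every four contiguous weights are zero,
# the semi-structured pattern accelerated by NVIDIA sparse tensor cores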

# for quantization calibration, we will use a subset of the dataset that was
# used to sparsify and finetune the model
dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split="train[:5%]")

# set dataset config parameters
max_seq_length = 1024
pad_to_max_length = False
num_calibration_samples = 512


# preprocess the data into a single text entry, then tokenize the dataset
def process_sample(sample):
    formatted = "Article:\n{}\n\n### Summarization:\n{}".format(
        sample["article"], sample["highlights"]
    )
    return tokenizer(
        formatted, padding=pad_to_max_length, max_length=max_seq_length, truncation=True
    )


tokenized_dataset = dataset.map(
    process_sample, remove_columns=["article", "highlights", "id"]
)
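# optional sanity check (illustrative, not part of the original example):
# decode one sample to confirm the prompt template and truncation look right
print(tokenizer.decode(tokenized_dataset[0]["input_ids"])[:300])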

# save location for the quantized model output
output_dir = "/network/sadkins/llama7b_sparse_24_w4a16_channel_compressed"

# apply the quantization recipe to the model and save the quantized output in a
# compressed int4 packed format; the sparsity structure of the original model
# will be maintained
oneshot(
    model=model,
    dataset=tokenized_dataset,
    recipe=recipe,
    output_dir=output_dir,
    max_seq_length=max_seq_length,
    pad_to_max_length=pad_to_max_length,
    num_calibration_samples=num_calibration_samples,
    save_compressed=True,
)
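# a minimal follow-up sketch, assuming the compressed checkpoint in output_dir
# reloads through the same SparseAutoModelForCausalLM API; the prompt text
# below is purely illustrative
quantized_model = SparseAutoModelForCausalLM.from_pretrained(
    output_dir, torch_dtype=torch.bfloat16, device_map="auto"
)
prompt = "Article:\nThe stock market rallied on Friday.\n\n### Summarization:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device)
print(tokenizer.decode(quantized_model.generate(**inputs, max_new_tokens=64)[0]))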
