Integrate INT8 mixed-precision from torchao 0.7 #1552

Open · wants to merge 49 commits into base: main
Changes shown from 5 commits (of 49)
Commits (49):

cf3355e  add int8mp (gau-nernst, Sep 12, 2024)
d3bbaeb  Merge branch 'pytorch:main' into int8mp (gau-nernst, Sep 13, 2024)
5a61d3e  add a flag (gau-nernst, Sep 13, 2024)
560039d  create a quantizer (gau-nernst, Sep 13, 2024)
2b6e066  add notes on when speedup can be expected (gau-nernst, Sep 13, 2024)
d32f5b8  clarify doc message (gau-nernst, Sep 13, 2024)
60dad97  update docs (gau-nernst, Sep 13, 2024)
8395070  add tiny log (gau-nernst, Sep 13, 2024)
b7b8a7d  update comment (gau-nernst, Sep 13, 2024)
2829b03  add guard on torch version and CUDA sm (gau-nernst, Sep 13, 2024)
688a1c8  add integration test (gau-nernst, Sep 13, 2024)
21391ad  update test (gau-nernst, Sep 13, 2024)
f885d56  use dummy alpaca (gau-nernst, Sep 13, 2024)
7db782c  fix typo (gau-nernst, Sep 14, 2024)
8306f9a  Merge branch 'pytorch:main' into int8mp (gau-nernst, Sep 14, 2024)
25a2451  convert speed test to smoke test (gau-nernst, Sep 14, 2024)
86d5f04  Merge branch 'int8mp' of github.com:gau-nernst/torchtune into int8mp (gau-nernst, Sep 14, 2024)
6094cdb  fix test (gau-nernst, Sep 14, 2024)
19a2d3e  add ao version guard (gau-nernst, Sep 14, 2024)
faec18d  fix (gau-nernst, Sep 14, 2024)
f4f1945  Merge branch 'pytorch:main' into int8mp (gau-nernst, Sep 14, 2024)
8fc2826  attempt LoRA (gau-nernst, Sep 14, 2024)
911df57  fix lora (gau-nernst, Sep 15, 2024)
51bbeac  skip LoRA (gau-nernst, Sep 15, 2024)
1e5ae92  skip NF4 (gau-nernst, Sep 15, 2024)
1e4eaf6  Merge branch 'pytorch:main' into int8mp (gau-nernst, Sep 15, 2024)
30585c2  Merge branch 'main' into int8mp (felipemello1, Oct 3, 2024)
45b4365  typo (felipemello1, Oct 3, 2024)
3e5b040  Merge branch 'main' into int8mp (gau-nernst, Nov 3, 2024)
1ac836a  remove unwanted chnages (gau-nernst, Nov 3, 2024)
5d94cb3  use module swap (gau-nernst, Nov 3, 2024)
06abd88  remove unused import (gau-nernst, Nov 3, 2024)
0ff702e  update docs. change to mixed_precision (gau-nernst, Nov 3, 2024)
05563f2  add test. small fixes (gau-nernst, Nov 3, 2024)
3050c32  add config entries (gau-nernst, Nov 3, 2024)
864c6fb  remove extra compile (gau-nernst, Nov 3, 2024)
1fed859  fix lora finetune (gau-nernst, Nov 3, 2024)
66e8cdd  Merge branch 'main' into int8mp (gau-nernst, Nov 8, 2024)
207308b  Merge branch 'main' into int8mp (gau-nernst, Nov 12, 2024)
0fecc26  fix version check (gau-nernst, Nov 12, 2024)
39e1fc1  dont set inductor config (gau-nernst, Nov 12, 2024)
b2bc5ef  Merge branch 'main' into int8mp (gau-nernst, Dec 5, 2024)
a334986  remove LoRA (gau-nernst, Dec 5, 2024)
d149801  remove PyTorch version check (gau-nernst, Dec 5, 2024)
03a1978  add checks in init. add entries to all applicable configs (gau-nernst, Dec 5, 2024)
35ca06a  Merge branch 'main' into int8mp (gau-nernst, Dec 10, 2024)
0699aa3  add space (gau-nernst, Dec 10, 2024)
be9c0fb  consolidate checks (gau-nernst, Dec 10, 2024)
ca29866  Merge branch 'pytorch:main' into int8mp (gau-nernst, Dec 25, 2024)
6 changes: 6 additions & 0 deletions recipes/full_finetune_single_device.py
@@ -212,6 +212,7 @@ def setup(self, cfg: DictConfig) -> None:
enable_activation_checkpointing=cfg.enable_activation_checkpointing,
compile_model=self._compile,
model_state_dict=ckpt_dict[training.MODEL_KEY],
quantizer_cfg=cfg.get("quantizer", None),
)
self._tokenizer = config.instantiate(cfg.tokenizer)
log.info("Tokenizer is initialized from file.")
@@ -345,6 +346,7 @@ def _setup_model(
enable_activation_checkpointing: bool,
compile_model: bool,
model_state_dict: Dict[str, Any],
quantizer_cfg: Optional[DictConfig] = None,
) -> nn.Module:
"""
Set up the model including enabling activation checkpointing.
@@ -360,6 +362,10 @@
model, auto_wrap_policy={modules.TransformerSelfAttentionLayer}
)

if quantizer_cfg is not None:
ebsmothers (Contributor) commented on Oct 3, 2024:

(Context in this comment)

Suggested change:
-if quantizer_cfg is not None:
+if quantizer_cfg is not None and quantizer_cfg.get("enabled", False):

quantizer = config.instantiate(quantizer_cfg)
model = quantizer.prepare(model)

model.load_state_dict(model_state_dict)

# Validate model was loaded in with the expected dtype.
47 changes: 47 additions & 0 deletions torchtune/training/quantization.py
Contributor commented:

n00b question: would it be nice to add some sort of import guard to tell the user they need torchao > N for this? torchao is not a requirement anymore.

cc: @ebsmothers

Contributor commented:

Does this work with older GPUs? Does it work on CPU? Maybe we need something like this:

_SUPPORTS_INT8_MIXED_PRECISION = (
    torch_version_ge("2.5.0")
    and torch.cuda.is_available()
    and torch.cuda.get_device_capability() >= (7, 5)
)

gau-nernst (Author) replied:

Good question! I use Triton for this, so it will probably run on any GPU Triton supports (same as torch.compile). Though I think only Ampere (sm80) and above have INT8 tensor cores. To be safe, I think we just guard for sm80 and above.

CPU is not supported. Technically it is possible, but I didn't add it in torchao since I can't reliably test it / don't see it being useful.

This works with PyTorch 2.4 (though FlexAttention requires PyTorch 2.5 🤣).
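
A minimal sketch of the kind of guard discussed above, assuming the sm80 cutoff from this reply rather than the (7, 5) in the earlier suggestion, and using packaging for the version check (not the exact code that landed in the PR):

import torch
from packaging.version import Version

# New-enough PyTorch, a CUDA device, and INT8 tensor cores (Ampere / sm80 or newer).
_SUPPORTS_INT8_MIXED_PRECISION_TRAINING = (
    Version(torch.__version__) >= Version("2.4.0")
    and torch.cuda.is_available()
    and torch.cuda.get_device_capability() >= (8, 0)
)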

gau-nernst (Author) commented:

Btw, doesn't QAT also need some kind of guard like this? 🤔

Contributor replied:

Sorry, I completely missed this convo previously. Now that we've asked users to install torchao manually, I'm taking a similar stance to what we have with PyTorch: people should always be running on the latest stable version. So I'd claim the first two lines of the _SUPPORTS_INT8_MIXED_PRECISION_TRAINING check are not strictly necessary (fine to keep them in though, I don't have a strong preference). But anyway, that should hopefully answer the question about why we don't have similar guards for QAT.

@@ -6,6 +6,11 @@

from typing import Callable, Optional

from torch import nn
from torchao.prototype.quantized_training import (
int8_mixed_precision_training,
Int8MixedPrecisionTrainingConfig,
)
from torchao.quantization import int8_dynamic_activation_int4_weight, quantize_
from torchao.quantization.prototype.qat import (
disable_8da4w_fake_quant,
@@ -18,11 +23,14 @@
Int8DynActInt4WeightQATQuantizerModuleSwap,
)

from torchtune.modules import TransformerDecoder


__all__ = [
"get_quantizer_mode",
"Int8DynActInt4WeightQuantizer",
"Int8DynActInt4WeightQATQuantizer",
"Int8MixedPrecisionTrainingQuantizer",
]


@@ -74,6 +82,45 @@ def quantize(self, model):
] = enable_8da4w_fake_quant_module_swap


class Int8MixedPrecisionTrainingQuantizer:
Contributor commented:

Here is my understanding:

  1. torchao has a nice API that does _quantize(model, config).
  2. This PR creates a wrapper specific to INT8. The wrapper is needed for two reasons:
     i) ignore lora_a/lora_b;
     ii) we don't do nested configs.
  3. This means that if we want to support FP8, bitnet, or any other torchao technique, we have to create a custom wrapper instead of doing _quantize(model, torchao_config).

IMO, this makes a lot of sense if:
a) every torchao technique interacts differently with torchtune, e.g. one doesn't work with offloading, another needs some extra work for ckpt, etc., and we can't solve it with a config parser;
b) there are realistically only a couple of quantization methods we will use from torchao (e.g. INT8 and FP8).

But if that's not the case, then we should probably avoid having a custom torchtune wrapper per torchao technique.

I guess we had a similar situation with OptimizerCPUOffload. We ended up giving up on a custom torchtune wrapper and just instantiated directly from the config.

In summary, instead of "Int8MixedPrecisionTrainingQuantizer", should we have a generalist torchtune "PrecisionTrainingQuantizer", just to handle LoRA, etc.?

gau-nernst (Author) replied:

I think PrecisionTrainingQuantizer is not needed atm. We can revisit it once there are more things like this (like you said, realistically I can only think of INT8 and FP8 for now, but that may change).
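
For context, the direct torchao call that the thread weighs against a torchtune-specific wrapper would look roughly like this (a sketch based on the imports in this file, not code from the PR):

from torchao.quantization import quantize_
from torchao.prototype.quantized_training import (
    int8_mixed_precision_training,
    Int8MixedPrecisionTrainingConfig,
)

# Applies INT8 mixed-precision training to every supported linear layer,
# with no torchtune-specific handling (e.g. no skipping of the LM head or LoRA params).
config = Int8MixedPrecisionTrainingConfig(output=True, grad_input=True, grad_weight=True)
quantize_(model, int8_mixed_precision_training(config))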

"""Apply INT8 mixed-precision training. During training, weights and activations
are dynamically quantized to INT8 to utilize INT8 tensor cores. This is also done
in the backward pass.

NOTE: due to the limitations of the current implementation, the following
requirements must be satisfied to enjoy speedup:

1. Must use ``torch.compile()`` (set ``compile=True``).
2. Inputs to the model must not be too dynamic e.g. input sequence length changes
for every batch.

To satisfy (2), you can use :class:`~torchtune.datasets.PackedDataset` (set
``dataset.packed=True``), which ensures input tokens always have fixed length.
"""

def __init__(
self,
output: bool = True,
grad_input: bool = True,
grad_weight: bool = True,
Contributor commented on lines +213 to +217:

I don't think there is a way around us always having our own wrapper, since we need it to deal with LoRA/other details. Therefore, I think we should add "enabled" here and have a config like this:

mixed_precision:
  _component_: torchtune.training.quantization.Int8MixedPrecisionTraining
  arg1: True
  arg2: True
  enabled: False

That way the configs can advertise this functionality.

I also think we should rename quantizer to "mixed_precision" or "mixed_precision_training", so it's self-explanatory.

Let me know if you need help with making the changes or if you disagree with the ideas above.

Thanks again for the PR! @ebsmothers might want to leave a few comments too.
For me, after these changes and the tests pass, I am good to merge.

ebsmothers (Contributor) commented on Oct 3, 2024:

Yeah, personally I like the change to rename quantizer -> mixed_precision (unless the idea is to later support full low-precision training using the same API, in which case it wouldn't make sense).

Edit: after talking with @felipemello1 about this, what do you think about adding the enabled flag to the config and checking it in the recipe? It'd look like this in the recipe. I think this is a good tradeoff because (a) we show the feature in our configs, (b) we don't conditionally define a no-op version of the quantizer (a pattern I really don't like) and are instead explicit about it in the recipe, and (c) it can be easily extended to other mixed precision strategies in the future.
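
A sketch of the recipe-side gating described here, assuming the config group is renamed to mixed_precision and carries an enabled flag (names come from this discussion, not necessarily the merged code):

mixed_precision_cfg = cfg.get("mixed_precision", None)
if mixed_precision_cfg is not None and mixed_precision_cfg.get("enabled", False):
    # The `enabled` key would need to be dropped here or accepted by the quantizer.
    mp_quantizer = config.instantiate(mixed_precision_cfg)
    model = mp_quantizer.prepare(model)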

gau-nernst (Author) replied:

Yeah, I agree the quantizer name can sound confusing right now, since the weights are actually not quantized (well, QAT also does not actually quantize the weights 😄).

Regarding the mixed_precision name, I'm thinking about what other training recipes out there would fall under this category (because it would be weird if we have mixed_precision but only one option). There are FP8 and FP16/BF16 mixed-precision training, but they follow a totally different UX:

  • FP8: the current FP8 training recipes, in torchao or Transformer Engine, are actually mixed-precision training, since master weights are still in high precision, although people usually don't say it is mixed-precision. Currently the FP8 UX is to do module swap (but I think for FP8 all-gather, FP8 also uses tensor subclasses). I think Vasiliy has plans to update the FP8 UX, not exactly sure. But anyway, this is an implementation detail which can be hidden away from torchtune users. (Also, someone should add FP8 to torchtune 😄)
  • BF16 (w/ FP32 master weights): for single-GPU, it would use torch.autocast(dtype=torch.bfloat16). For FSDP2, it would use MixedPrecisionPolicy(param_dtype=torch.bfloat16). This is a very different UX (and mechanism) from INT8 and FP8, and would be hard to unify under the torchtune user-facing API (and I'm not sure if torchtune wants to add this option? Btw, there can be small gaps between full BF16 training (current torchtune) and BF16 mixed-precision training, especially when the LR is small. It would take some effort for someone to investigate this 😅).
  • FP16 (w/ FP32 master weights): probably won't consider this due to the complexity of gradient scaling; plus, finetuning BF16 models with FP16 is a bad idea.

There are also MX FP6/FP4 in the new NVIDIA GPUs, but that's too far in the future. Considering that at least we can have INT8 and FP8 mixed-precision training, using the mixed_precision name seems reasonable!
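
To make the UX gap in the BF16 bullet concrete, a rough sketch (illustrative only; the FSDP2 import path shown is the PyTorch 2.4/2.5 location and may differ in newer releases):

import torch
from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy

# Single GPU: parameters stay in FP32; compute inside the autocast region runs in BF16.
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    loss = model(tokens).sum()  # stand-in for the real loss computation

# FSDP2: the equivalent knob is a MixedPrecisionPolicy passed when sharding the model.
fully_shard(model, mp_policy=MixedPrecisionPolicy(param_dtype=torch.bfloat16))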

felipemello1 (Contributor) commented on Oct 4, 2024:

> also someone should add FP8 to torchtune

Would it be too different from int8? Or could we leverage the same API and replace the quantization module?
BTW, does FP8 work with FSDP? In this case, is there anything that makes it special, since int8 seems to not be compatible?

gau-nernst (Author) replied:

@felipemello1 Currently the torchao API is different: https://github.com/pytorch/ao/tree/main/torchao/float8. But it can be hidden away by torchtune (e.g. a separate Quantizer class, or some other name like a MixedPrecisionTrainer class).

Both FP8 (by the FP8 folks) and INT8 (by me) support FSDP2. The issue is with how torchtune handles distributed weight loading, which is not trivial to handle. FP8 module swap might not have this problem though.

) -> None:
self._config = Int8MixedPrecisionTrainingConfig(
output=output,
grad_input=grad_input,
grad_weight=grad_weight,
)

def prepare(self, model: nn.Module) -> nn.Module:
felipemello1 (Contributor) commented on Nov 11, 2024:

Neither the 3B nor the 8B full single-device recipe is working for me on an A100.

torch        2.6.0.dev20241107+cu124
torchao      0.7.0.dev20241111+cu124
torchtune    0.0.0 /data/users/felipemello/torchtune
torchvision  0.20.0.dev20241107+cu124

I also tried with torch 2.5.1.

I think the issue is with recompiling. It takes several minutes on this step (10+ min). I wonder if it's recompiling non-stop until it runs out of memory and errors with RuntimeError: std::bad_alloc.

8B:

  File "/data/users/felipemello/torchtune/torchtune/modules/transformer.py", line 649, in forward
    output = self.chunked_output(h)
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/felipemello/.conda/envs/torchtune/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 721, in _fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/data/users/felipemello/torchtune/torchtune/modules/transformer.py", line 501, in chunked_output
    return [
           ^
  File "/data/users/felipemello/torchtune/torchtune/modules/transformer.py", line 502, in <listcomp>
    self.output(chunk)
  File "/home/felipemello/.conda/envs/torchtune/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1740, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/felipemello/.conda/envs/torchtune/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/felipemello/.conda/envs/torchtune/lib/python3.11/site-packages/torch/nn/modules/linear.py", line 125, in forward
    return F.linear(input, self.weight, self.bias)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: std::bad_alloc

3B:

  File "/data/users/felipemello/torchtune/torchtune/modules/transformer.py", line 649, in forward
    output = self.chunked_output(h)
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/felipemello/.conda/envs/torchtune/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 721, in _fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/data/users/felipemello/torchtune/torchtune/modules/transformer.py", line 501, in chunked_output
    return [
           ^
  File "/data/users/felipemello/torchtune/torchtune/modules/transformer.py", line 502, in <listcomp>
    self.output(chunk)
  File "/data/users/felipemello/torchtune/torchtune/modules/tied_linear.py", line 66, in __call__
    return self.linear(x, self.tied_module.weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/felipemello/.conda/envs/torchtune/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1740, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/felipemello/.conda/envs/torchtune/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/users/felipemello/torchtune/torchtune/modules/tied_linear.py", line 27, in forward
    return F.linear(x, weight)
           ^^^^^^^^^^^^^^^^^^^
RuntimeError: std::bad_alloc

PS: for 3B we use the TiedLinear implementation:

I printed the shape and dtype, not that it matters:

torch.Size([5, 256, 3072]) torch.bfloat16
torch.Size([128256, 3072]) torch.bfloat16

I also checked for 3B, and filter_fn returns False for it.

gau-nernst (Author) replied:

Regarding std::bad_alloc, I think it's an issue with recent PyTorch nightlies; the torchao CI has been seeing it too (pytorch/ao#1229). The CI currently pins to 20241101 (https://github.com/pytorch/ao/blob/b2642fb33e360ffe478fe19665b1c4efd80537c6/.github/workflows/regression_test.yml#L64). Can you try with this version?

I don't exactly recall, but I think I usually had to kill the first run (it kinda hangs?), and then the subsequent run is fast. Didn't think much of it. I will try to reproduce it again in a fresh environment to see if my memory serves me right.

gau-nernst (Author) replied:

OK, I additionally set quantize_(set_inductor_config=False) to avoid exhaustive torch.compile tuning. Can you try again? It seemed fast to me (<2 min). Worked with both Llama3.1-8B and Llama3.2-3B, on torch==2.5.1+cu121 and torchao==0.7.0.dev20241112+cu121.

The command:

tune run full_finetune_single_device --config llama3_2/3B_full_single_device dataset.packed=True tokenizer.max_seq_len=8192 optimizer=torch.optim.AdamW optimizer.fused=True optimizer_in_bwd=False compile=True metric_logger=torchtune.training.metric_logging.WandBLogger log_peak_memory_stats=True mixed_precision._component_=torchtune.training.quantization.Int8MixedPrecisionTrainingQuantizer mixed_precision.enabled=True
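
The flag presumably corresponds to the later "dont set inductor config" commit (39e1fc1) and ends up in the torchao quantize_ call, roughly like this (a sketch of the call site, not the exact diff shown below):

quantize_fn = int8_mixed_precision_training(self._config)
# Skip torchao's aggressive inductor autotune defaults to avoid very long compile times.
quantize_(model.layers, quantize_fn, set_inductor_config=False)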

# don't apply INT8 mixed-precision training to LM head
# since speed is slightly lower.
quantize_fn = int8_mixed_precision_training(self._config)
if isinstance(model, TransformerDecoder):
quantize_(model.layers, quantize_fn)
else:
quantize_(model, quantize_fn)
return model


def get_quantizer_mode(quantizer: Optional[Callable]) -> Optional[str]:
"""Given a quantizer object, returns a string that specifies the type of quantization.
