Fixing recompiles in KV-cache + compile (#1663)
SalmanMohammadi authored Oct 2, 2024
1 parent fc0249d commit bae4b27
Showing 2 changed files with 50 additions and 16 deletions.
32 changes: 29 additions & 3 deletions tests/torchtune/modules/test_kv_cache.py
@@ -6,6 +6,7 @@

import pytest
import torch
import torch._dynamo.testing
from torchtune.modules import KVCache

BSZ = 2
@@ -52,6 +53,7 @@ def test_kv_cache_init(self, kv_cache):
def test_kv_cache_reset(self, kv_cache, k_vals_full, v_vals_full):
kv_cache.update(k_vals_full, v_vals_full)
kv_cache.reset()

assert (kv_cache.k_cache == 0).all() and (kv_cache.v_cache == 0).all()
assert kv_cache.size == 0

@@ -62,7 +64,7 @@ def test_kv_cache_error_when_bsz_exceeded(self, kv_cache, k_vals_full, v_vals_fu
def test_kv_cache_error_when_seq_len_exceeded(
self, kv_cache, k_vals_full, v_vals_full
):
-        with pytest.raises(ValueError):
+        with pytest.raises(AssertionError):
kv_cache.update(k_vals_full.repeat(1, 1, 4, 1), v_vals_full)

def test_kv_cache_error_when_seq_len_exceeded_after_update(
@@ -75,8 +77,7 @@ def test_kv_cache_error_when_seq_len_exceeded_after_update(
v_vals_full[:, :, : (MAX_SEQ_LEN // 2)],
)
with pytest.raises(
-            ValueError,
-            match=f"cache has reached a sequence length of {MAX_SEQ_LEN + MAX_SEQ_LEN // 2}",
+            AssertionError,
):
# now an invalid update exceeding the cache
kv_cache.update(k_vals_full, v_vals_full)
@@ -151,3 +152,28 @@ def test_kv_cache_multiple_updates(self, kv_cache, k_vals_full, v_vals_full):

assert torch.equal(expected_k_out, k_out)
assert torch.equal(expected_v_out, v_out)

    def test_kv_cache_no_recompiles(self, kv_cache, k_vals_full, v_vals_full):
        def fn(k_val, v_val):
            return kv_cache.update(k_val, v_val)

        cnts = torch._dynamo.testing.CompileCounter()
        # this effectively does torch.compile(fn)
        fn = torch._dynamo.optimize(cnts, nopython=True)(fn)

        # make an update filling half the cache - like a prefill
        # fills position 0 through to (MAX_SEQ_LEN // 2) - 1
        kv_cache.update(
            k_vals_full[:, :, : (MAX_SEQ_LEN // 2)],
            v_vals_full[:, :, : (MAX_SEQ_LEN // 2)],
        )

        # now make successive updates for one token position at a time
        # and ensure there are no recompiles
        for i in range(MAX_SEQ_LEN // 2):
            fn(
                k_vals_full[:, :, (MAX_SEQ_LEN // 2) + i].unsqueeze(2),
                v_vals_full[:, :, (MAX_SEQ_LEN // 2) + i].unsqueeze(2),
            )

        assert cnts.frame_count == 1
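
(For readers less familiar with the dynamo testing utilities: torch._dynamo.optimize(cnts, nopython=True)(fn) is roughly the older spelling of torch.compile(fn, backend=cnts, fullgraph=True). The CompileCounter backend counts how many graphs dynamo compiles, so the final assertion checks that the single-token updates after the prefill never trigger a recompile.)
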
34 changes: 21 additions & 13 deletions torchtune/modules/kv_cache.py
@@ -40,14 +40,20 @@ def __init__(
self.register_buffer(
"v_cache", torch.zeros(cache_shape, dtype=dtype), persistent=False
)
-        self.size = 0
+        self.register_buffer(
+            "cache_pos", torch.arange(0, cache_shape[2]), persistent=False
+        )
self.batch_size = batch_size

def reset(self) -> None:
"""Reset the cache to zero."""
self.k_cache.zero_()
self.v_cache.zero_()
-        self.size = 0
+        self.cache_pos -= self.size

+    @property
+    def size(self) -> int:
+        return self.cache_pos[0].item()

def update(
self, k_val: torch.Tensor, v_val: torch.Tensor
@@ -80,7 +86,7 @@ def update(
Tuple[torch.Tensor, torch.Tensor]: Updated key and value cache tensors, respectively.
Raises:
-            ValueError: if the sequence length of ``k_val`` is longer than the maximum cache sequence length.
+            AssertionError: if the sequence length of ``k_val`` is longer than the maximum cache sequence length.
ValueError: if the batch size of the new key (or value) tensor is greater than the batch size
used during cache setup.
"""
@@ -91,18 +97,20 @@
f", but found new key tensors with batch size {k_val.shape[0]}!"
)

-        if (self.size + seq_len) > self.k_cache.shape[2]:
-            raise ValueError(
-                f"The current cache has been setup with a sequence length of {self.k_cache.shape[2]}"
-                f", but the cache has reached a sequence length of {(self.size + seq_len)}!"
-            )
-        cache_pos = torch.arange(self.size, self.size + seq_len, device=k_val.device)
-        self.size += seq_len

+        assert (self.cache_pos[0] + seq_len) <= self.k_cache.shape[2]
k_out = self.k_cache
v_out = self.v_cache

-        k_out[:, :, cache_pos] = k_val
-        v_out[:, :, cache_pos] = v_val
+        k_out[:, :, self.cache_pos[:seq_len]] = k_val
+        v_out[:, :, self.cache_pos[:seq_len]] = v_val

+        # forward cache_pos seq_len positions along
+        # cache_pos starts at (0, 1, 2, 3, 4, 5, ...)
+        # an update of seq_len = 5 tokens brings it to
+        # (5, 6, 7, 8, 9, ...)
+        # this allows us to track the current position in the cache
+        # after the last update in a compile-friendly way without any dynamism
+        # e.g. relying on an int size tracker, or re-creating cache_pos every time
+        self.cache_pos += seq_len

return k_out, v_out
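
To make the new indexing concrete, here is a small standalone sketch (not part of the commit) of the cache_pos mechanics. TinyCache and its 1-D cache are made up for illustration; the real KVCache buffers have shape [batch_size, num_heads, max_seq_len, head_dim].

import torch

class TinyCache:
    """Simplified 1-D stand-in for the KVCache above."""

    def __init__(self, max_seq_len: int = 8):
        self.cache = torch.zeros(max_seq_len)
        self.cache_pos = torch.arange(0, max_seq_len)

    def update(self, vals: torch.Tensor) -> torch.Tensor:
        seq_len = vals.shape[0]
        # the first seq_len entries of cache_pos are the positions to write into
        self.cache[self.cache_pos[:seq_len]] = vals
        # advance every entry by seq_len; cache_pos[0] then equals the fill level
        self.cache_pos += seq_len
        return self.cache

tc = TinyCache()
tc.update(torch.tensor([1.0, 2.0, 3.0, 4.0]))  # "prefill" of four tokens
tc.update(torch.tensor([5.0]))                 # one-token decode step
print(tc.cache)         # tensor([1., 2., 3., 4., 5., 0., 0., 0.])
print(tc.cache_pos[0])  # tensor(5) -- what the new size property reports

Because cache_pos lives in a buffer and is only ever sliced and advanced by tensor ops, the cache's fill level never flows back into Python control flow inside update, which is what makes it compile-friendly.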

4 comments on commit bae4b27

@psoulos (Contributor) commented on bae4b27 Oct 9, 2024

Hi @SalmanMohammadi, do you have any suggestions for where I can learn more about why this PR fixes recompiles? My guess is that calling cache_pos = torch.arange(self.size, self.size + seq_len, device=k_val.device) moves a new tensor to the device which disrupts compilation, but I would like to learn more!

@SalmanMohammadi (Collaborator, Author) commented:

Hey @psoulos! Did you check out the PR description? There's a high-level summary of the issue and solution there: #1663.

@psoulos (Contributor) commented:

Thank you @SalmanMohammadi! The PR description and discussion are very helpful. I have a more specific question about this line and why it causes an issue:

cache_pos = torch.arange(self.size, self.size + seq_len, device=k_val.device)

Between the first pass, where you cache the prompt, and the second pass, where you cache the first generated token, there is a size mismatch, so that recompilation is expected. But why is there a recompilation between the second and third passes? The arange changes by an index of 1, but in both cases the resulting cache_pos should have a tensor size of [1].

Thanks for helping me understand compilation better!

@SalmanMohammadi (Collaborator, Author) commented on bae4b27 Oct 13, 2024

Excellent question. Feel free to ping me on the torchtune Discord btw if you'd ever like to chat about this more!

I'll try to explain as best I can - most of my compile knowledge is pretty hacky and I kind of stumbled upon this solution.

The arange changes by an index of 1, but in both cases the resulting cache_pos should have a tensor size of [1].

So you're absolutely right here and this was my intuition as well. The part of the torch.compile stack that is responsible for tracing the code and generating intermediate graph representations is torch dynamo. Now, dynamo will use FakeTensors to trace the graph and guard on properties of the tensor such as size, dtype, etc. You're right about the first recompilation where dynamo will replace the size of the cache with a SymInt.

The next recompiles happen not on the tensor, but on KVCache.size, the property of the object itself.

V0924 15:11:42.775000 98048 torch/_dynamo/guards.py:2830] [0/1] [__recompiles]     - 0/0: L['args'][1]._modules['layers']._modules['0']._modules['attn']._modules['kv_cache'].size == 4
tensor(-0.0110)
V0924 15:11:45.333000 98048 torch/_dynamo/guards.py:2830] [0/2] [__recompiles]     triggered by the following guard failure(s):
V0924 15:11:45.333000 98048 torch/_dynamo/guards.py:2830] [0/2] [__recompiles]     - 0/1: L['args'][1]._modules['layers']._modules['0']._modules['attn']._modules['kv_cache'].size == 5
V0924 15:11:45.333000 98048 torch/_dynamo/guards.py:2830] [0/2] [__recompiles]     - 0/0: L['args'][1]._modules['layers']._modules['0']._modules['attn']._modules['kv_cache'].size == 4

You can see that we're actually setting up guards on each specific KVCache.size, not on the cache_pos tensor. I think what's going on here is that dynamo needs to resolve KVCache.size to actually infer the size of cache_pos, and generally torch compile isn't great at any kind of data-dependent dynamism - it seems to set up guards on every concrete value of self.size and re-generate graphs each time.

EDIT: The above is specifically related to tensors with a size of 1, I think (see https://docs.google.com/document/d/1y5CRfMLdwEoF1nTk9q8qEu1mgMUuUtvhklPKJ2emLU8/edit?pli=1&tab=t.0#heading=h.usdkciy5xgk0)

Sorry that my answer isn't super put-together (and probably not even accurate in many places!), but this is my line of thought here.
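
If it helps, here is a rough, self-contained repro sketch (not from this PR; the class names are made up) that contrasts the two position-tracking strategies using torch._dynamo.testing.CompileCounter. The exact recompile counts depend on the PyTorch version, but the int-tracked cache should report more than one compiled frame while the buffer-tracked cache should report exactly one.

import torch
import torch._dynamo.testing


class IntPositionCache(torch.nn.Module):
    """Old-style tracking: a plain Python int is read and mutated inside the compiled region."""

    def __init__(self, max_len: int = 32):
        super().__init__()
        self.register_buffer("cache", torch.zeros(max_len))
        self.size = 0

    def forward(self, val: torch.Tensor) -> torch.Tensor:
        # the write positions are rebuilt from a Python int on every call
        pos = torch.arange(self.size, self.size + val.shape[0])
        self.cache[pos] = val
        self.size += val.shape[0]
        return self.cache


class TensorPositionCache(torch.nn.Module):
    """New-style tracking: positions live in a buffer, so no Python int is guarded on."""

    def __init__(self, max_len: int = 32):
        super().__init__()
        self.register_buffer("cache", torch.zeros(max_len))
        self.register_buffer("cache_pos", torch.arange(0, max_len))

    def forward(self, val: torch.Tensor) -> torch.Tensor:
        seq_len = val.shape[0]
        self.cache[self.cache_pos[:seq_len]] = val
        self.cache_pos += seq_len
        return self.cache


for cache in (IntPositionCache(), TensorPositionCache()):
    cnts = torch._dynamo.testing.CompileCounter()
    compiled = torch.compile(cache, backend=cnts)
    for _ in range(8):
        compiled(torch.randn(1))  # one-token updates, like decoding
    # the int-tracked cache typically recompiles as `size` changes between calls,
    # while the buffer-tracked cache should stay at a single compiled frame
    print(type(cache).__name__, "frame_count =", cnts.frame_count)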
