Reduce block fragmentation (HabanaAI#426)

Change `NaiveBlockAllocator` to use a priority queue (min-heap) so that the lowest free block id is always allocated first. This reduces block fragmentation, which further improves the performance of contiguous paged attention.

- [ ] Add an option or env variable to enable/disable this behavior. (Not sure if this is necessary.)

---------

Co-authored-by: Yang Wang <[email protected]>
zhouyu5 · Oct 31, 2024 · d42c2a2 · d42c2a2
1 parent d3257b2
commit d42c2a2
Showing 1 changed file with 5 additions and 4 deletions.
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
@@ -1,4 +1,4 @@
-from collections import deque
+import heapq
 from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple
 
 from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
@@ -36,7 +36,8 @@ def __init__(
         if block_ids is None:
             block_ids = range(num_blocks)
 
-        self._free_block_indices: Deque[BlockId] = deque(block_ids)
+        self._free_block_indices: Deque[BlockId] = block_ids[:]
+        heapq.heapify(self._free_block_indices)
         self._all_block_indices = frozenset(block_ids)
         assert len(self._all_block_indices) == num_blocks
 
@@ -129,7 +130,7 @@ def _allocate_block_id(self) -> BlockId:
         if not self._free_block_indices:
             raise BlockAllocator.NoFreeBlocksError()
 
-        block_id = self._free_block_indices.popleft()
+        block_id = heapq.heappop(self._free_block_indices)
         self._refcounter.incr(block_id)
         return block_id
 
@@ -139,7 +140,7 @@ def _free_block_id(self, block: Block) -> None:
 
         refcount = self._refcounter.decr(block_id)
         if refcount == 0:
-            self._free_block_indices.appendleft(block_id)
+            heapq.heappush(self._free_block_indices, block_id)
 
         block.block_id = None