Skip to content

Commit

Permalink
Reduce block fragmentation (HabanaAI#426)
Browse files Browse the repository at this point in the history
Change `NaiveBlockAllocator` to keep its free block ids in a priority queue
(a `heapq` min-heap) so that we always allocate the lowest free block id first.

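Allocating the lowest ids first keeps the in-use blocks packed together, which
reduces block fragmentation and further increases the performance of
contiguous paged attention.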

- [ ] Add an option or environment variable to enable/disable this behavior.
(Not sure whether this is necessary.)

---------

Co-authored-by: Yang Wang <[email protected]>
  • Loading branch information
yangw1234 and yangw1234 authored Oct 31, 2024
1 parent d3257b2 commit d42c2a2
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions vllm/core/block/naive_block.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from collections import deque
import heapq
from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple

from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
Expand Down Expand Up @@ -36,7 +36,8 @@ def __init__(
if block_ids is None:
block_ids = range(num_blocks)

self._free_block_indices: Deque[BlockId] = deque(block_ids)
self._free_block_indices: Deque[BlockId] = block_ids[:]
heapq.heapify(self._free_block_indices)
self._all_block_indices = frozenset(block_ids)
assert len(self._all_block_indices) == num_blocks

Expand Down Expand Up @@ -129,7 +130,7 @@ def _allocate_block_id(self) -> BlockId:
if not self._free_block_indices:
raise BlockAllocator.NoFreeBlocksError()

block_id = self._free_block_indices.popleft()
block_id = heapq.heappop(self._free_block_indices)
self._refcounter.incr(block_id)
return block_id

Expand All @@ -139,7 +140,7 @@ def _free_block_id(self, block: Block) -> None:

refcount = self._refcounter.decr(block_id)
if refcount == 0:
self._free_block_indices.appendleft(block_id)
heapq.heappush(self._free_block_indices, block_id)

block.block_id = None

Expand Down

0 comments on commit d42c2a2

Please sign in to comment.