From 0272ab68937ead7ba0b5342d3452f37b9776ff73 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Dec 2024 14:22:13 +0200 Subject: [PATCH 1/3] Enable padding aware scheduling by default --- vllm/engine/arg_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5a64741f3b709..c3dd364382741 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -119,7 +119,8 @@ class EngineArgs: enable_prefix_caching: Optional[bool] = None disable_sliding_window: bool = False use_v2_block_manager: bool = True - use_padding_aware_scheduling: bool = False + use_padding_aware_scheduling: bool = False if \ + not current_platform.is_hpu() else True swap_space: float = 4 # GiB cpu_offload_gb: float = 0 # GiB gpu_memory_utilization: float = 0.90 From 57c16cc87c2ec2851e54318a732225bbed92bfb5 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Dec 2024 14:23:29 +0200 Subject: [PATCH 2/3] Enable padding aware scheduling by default on HPU --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c3dd364382741..e5fc82128bb41 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -455,7 +455,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: action='store_true', help=('Use padding-aware scheduling. If True, the scheduler ' 'will consider padded tokens in prefill. ' - 'By default this is set to False. ')) + 'By default this is set to False on non-HPU devices. 
')) parser.add_argument( '--num-lookahead-slots', type=int, From 72cc90972cacf4af8ea69366a6043e62b6636acf Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Dec 2024 14:28:24 +0200 Subject: [PATCH 3/3] Simplify use_padding_aware_scheduling default to satisfy ruff --- vllm/engine/arg_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e5fc82128bb41..8fd96aad25357 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -119,8 +119,7 @@ class EngineArgs: enable_prefix_caching: Optional[bool] = None disable_sliding_window: bool = False use_v2_block_manager: bool = True - use_padding_aware_scheduling: bool = False if \ - not current_platform.is_hpu() else True + use_padding_aware_scheduling: bool = current_platform.is_hpu() swap_space: float = 4 # GiB cpu_offload_gb: float = 0 # GiB gpu_memory_utilization: float = 0.90