diff --git a/vllm/config.py b/vllm/config.py
index 9ee798dc017de..901b360b2864d 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1132,7 +1132,7 @@ class SchedulerConfig:
     # Maximum number of prefill sequences to be
     # processed in a single iteration. Used only with padding-aware
     # scheduling.
-    max_num_prefill_seqs: Optional[int] = None,
+    max_num_prefill_seqs: Optional[int] = None
     # If True, scheduler will consider padded
     # tokens in prefill.
 
diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py
index ecd42bbb7a773..f71837c482d9f 100644
--- a/vllm/model_executor/layers/spec_decode_base_sampler.py
+++ b/vllm/model_executor/layers/spec_decode_base_sampler.py
@@ -30,19 +30,6 @@ def __init__(self, strict_mode: bool = False):
         self.num_emitted_tokens: Optional[torch.Tensor] = None
         self.num_draft_tokens: int = 0
 
-    def init_tensors(self,
-                     device: Union[int, str],
-                     device_type: str = 'cuda') -> None:
-        assert self.num_accepted_tokens is None
-        if isinstance(device, int):
-            device = f"{device_type}:{device}"
-        self.num_accepted_tokens = torch.tensor(0,
-                                                dtype=torch.long,
-                                                device=device)
-        self.num_emitted_tokens = torch.tensor(0,
-                                               dtype=torch.long,
-                                               device=device)
-
     def init_tensors(self,
                      device: Union[int, str],
                      device_type: Union[torch.device, str] = 'cuda') -> None:
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index 0a726f8cc23ee..c7d81fd14dfde 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -78,14 +78,6 @@ def __init__(self,
         self._rejsample_metrics_collect_interval_s = collect_interval_s
         self._last_metrics_collect_time = self._timer()
 
-    def init_tensors(self, rank: int, device: torch.device) -> None:
-        self._rank = rank
-        if device.type == 'hpu':
-            import habana_frameworks.torch as htorch
-            self._copy_stream = htorch.hpu.Stream()
-        else:
-            self._copy_stream = torch.cuda.Stream()
-
     def init_tensors(self,
                      rank: int,
                      device_type: Union[torch.device, str] = 'cuda') -> None:
@@ -94,6 +86,9 @@ def init_tensors(self,
             device_type = device_type.type
         if device_type == 'cuda':
            self._copy_stream = torch.cuda.Stream()
+        elif device_type == 'hpu':
+            import habana_frameworks.torch as htorch
+            self._copy_stream = htorch.hpu.Stream()
 
     def maybe_collect_rejsample_metrics(
             self, k: int) -> Optional[SpecDecodeWorkerMetrics]: