diff --git a/.github/workflows/cpu-test.yml b/.github/workflows/cpu-test.yml
index 529af9fc7b1ec..64bdd36a6e3eb 100644
--- a/.github/workflows/cpu-test.yml
+++ b/.github/workflows/cpu-test.yml
@@ -31,4 +31,4 @@ jobs:
         VLLM_TARGET_DEVICE=hpu python setup.py develop
     - name: cpu-test
       run: |
-        VLLM_SKIP_WARMUP=true VLLM_PROMPT_SEQ_BUCKET_MAX=128 python examples/offline_inference_fakehpu.py --fake_hpu
+        VLLM_SKIP_WARMUP=true VLLM_PROMPT_SEQ_BUCKET_MAX=128 VLLM_USE_FAKE_HPU=1 python examples/offline_inference_fakehpu.py --fake_hpu
diff --git a/vllm/config.py b/vllm/config.py
index 6acb70ad047b2..839a00ef0a4ca 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -12,7 +12,7 @@
 from vllm.tracing import is_otel_installed
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
-                        is_hip, is_hpu, is_neuron, is_openvino, is_tpu, is_xpu,
+                        is_hip, is_hpu, is_fake_hpu, is_neuron, is_openvino, is_tpu, is_xpu,
                         print_warning_once)
 
 if TYPE_CHECKING:
@@ -858,6 +858,8 @@ def __init__(self, device: str = "auto") -> None:
             # Automated device type detection
             if is_neuron():
                 self.device_type = "neuron"
+            elif is_fake_hpu():
+                self.device_type = "cpu"
             elif is_hpu():
                 self.device_type = "hpu"
             elif is_openvino():
diff --git a/vllm/utils.py b/vllm/utils.py
index ae0fe26010f06..facdb30ec8e93 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -219,6 +219,7 @@ def is_fake_hpu() -> bool:
 
 
 @lru_cache(maxsize=None)
 def _is_habana_frameworks_installed() -> bool:
+    if os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0': return False
     from importlib import util
     return util.find_spec('habana_frameworks') is not None
@@ -997,7 +998,9 @@ def cuda_device_count_stateless() -> int:
 
 
 def get_device() -> str:
-    if is_hpu():
+    if is_fake_hpu():
+        return "cpu"
+    elif is_hpu():
         return "hpu"
     return "cuda"
 
@@ -1143,7 +1146,6 @@ def _return_false():
 
 
 def _migrate_to_cpu():
     import habana_frameworks.torch as htorch
-    htorch.core.mark_step = _do_nothing
     htorch.utils.internal.is_lazy = _return_false
     torch.hpu.synchronize = _do_nothing
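
Taken together, the vllm/utils.py hunks route device selection to CPU whenever fake-HPU mode is requested. Below is a minimal, self-contained sketch of that selection order; it is an illustration, not the vllm implementation. It assumes is_fake_hpu() (whose body is not part of this diff) keys off the same VLLM_USE_FAKE_HPU variable that the patched _is_habana_frameworks_installed() checks, and the is_hpu() stub is a hypothetical stand-in for the real helper.

# Sketch only -- not the actual vllm code. Assumes is_fake_hpu() reads
# the same VLLM_USE_FAKE_HPU variable checked in the diff above.
import os
from functools import lru_cache


@lru_cache(maxsize=None)
def is_fake_hpu() -> bool:
    # Assumption: any value other than '0' enables fake-HPU mode,
    # matching the env-var check added to _is_habana_frameworks_installed().
    return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0'


def is_hpu() -> bool:
    # Hypothetical stand-in for vllm.utils.is_hpu: treat an HPU stack as
    # present only if habana_frameworks is importable.
    from importlib import util
    return util.find_spec('habana_frameworks') is not None


def get_device() -> str:
    # Mirrors the patched get_device(): the fake-HPU check must come
    # first, so a CPU-only host short-circuits before probing for a
    # real HPU stack.
    if is_fake_hpu():
        return "cpu"
    elif is_hpu():
        return "hpu"
    return "cuda"


# e.g. VLLM_USE_FAKE_HPU=1 python sketch.py  ->  prints "cpu"
if __name__ == "__main__":
    print(get_device())

This ordering is why the CI job above only needs VLLM_USE_FAKE_HPU=1 in the environment: the gate wins before any habana_frameworks import is attempted, so the workflow can run on a plain CPU runner.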