# Patterns for the three fields read from ``hl-smi -q -d PRODUCT`` output.
# They are applied with ``match`` to whitespace-stripped lines, so the key
# must start the line; padding around the colon is tolerated.
_HPU_MODEL_RE = re.compile(r'Product Name\s*:\s*(.+)')
_HPU_COUNT_RE = re.compile(r'Attached AIPs\s*:\s*(\d+)')
_HPU_DRIVER_RE = re.compile(r'Driver Version\s*:\s*(.+)')


def _parse_hpu_info(output):
    """Parse ``hl-smi`` product output into ``(models, driver_version)``.

    The first occurrence of each field wins. Any field that is absent is
    reported as ``'N/A'`` (or, for the device count, simply omitted from the
    model string) instead of leaking ``None`` into the report.
    """
    hpu_count = None
    hpu_model = None
    hpu_driver = None

    for raw_line in output.splitlines():
        # Strip all surrounding whitespace (tabs, spaces, '\r') so indented
        # per-device lines still match the anchored patterns.
        line = raw_line.strip()

        if hpu_count is None:
            found = _HPU_COUNT_RE.match(line)
            if found:
                hpu_count = found.group(1)
                continue

        if hpu_model is None:
            found = _HPU_MODEL_RE.match(line)
            if found:
                hpu_model = found.group(1)
                continue

        if hpu_driver is None:
            found = _HPU_DRIVER_RE.match(line)
            if found:
                hpu_driver = found.group(1)
                continue

        # Everything has been found; no need to scan the remaining devices.
        if hpu_model and hpu_count and hpu_driver:
            break

    driver = hpu_driver if hpu_driver else 'N/A'
    if hpu_model is None:
        return ('N/A', driver)
    if hpu_count is None:
        # Avoid rendering "Nonex <model>" when the count line is missing.
        return (hpu_model, driver)
    return (f'{hpu_count}x {hpu_model}', driver)


def get_hpu_info(run_lambda=None):
    """Return ``(hpu_models, hpu_driver_version)`` as reported by ``hl-smi``.

    Args:
        run_lambda: Optional callable used to execute the command. It is
            called with the command as a single string and must return a
            ``(return_code, stdout, stderr)`` tuple.
            NOTE(review): this is assumed to be the same convention as the
            ``run_lambda`` passed to the other collectors in this file;
            confirm before wiring it in. When omitted, ``hl-smi`` is invoked
            directly through ``subprocess``.

    Returns:
        A 2-tuple of strings. The first element is ``'<count>x <model>'``
        (or just the model when no count is reported); the second is the
        driver version. Either element is ``'N/A'`` when the information
        is unavailable, e.g. when ``hl-smi`` is not installed.
    """
    command = ["hl-smi", "-q", "-d", "PRODUCT"]
    try:
        if run_lambda is not None:
            _, output, _ = run_lambda(' '.join(command))
        else:
            # subprocess.run waits for the child and closes its pipes, so no
            # process or file handle is left behind.
            completed = subprocess.run(command,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.DEVNULL,
                                       universal_newlines=True)
            output = completed.stdout
    except Exception:
        # Environment collection is best-effort: a missing or failing tool
        # must never abort the whole report.
        return ('N/A', 'N/A')

    return _parse_hpu_info(output or '')
torch_version=version_str, @@ -571,6 +608,8 @@ def get_version_or_na(cfg, prefix): nvidia_gpu_models=get_gpu_info(run_lambda), nvidia_driver_version=get_nvidia_driver_version(run_lambda), cudnn_version=get_cudnn_version(run_lambda), + habana_hpu_models=hpu_info[0], + habana_driver_version=hpu_info[1], hip_compiled_version=hip_compiled_version, hip_runtime_version=hip_runtime_version, miopen_runtime_version=miopen_runtime_version, @@ -613,6 +652,8 @@ def get_version_or_na(cfg, prefix): GPU models and configuration: {nvidia_gpu_models} Nvidia driver version: {nvidia_driver_version} cuDNN version: {cudnn_version} +HPU devices: {habana_hpu_models} +HPU driver version: {habana_driver_version} HIP runtime version: {hip_runtime_version} MIOpen runtime version: {miopen_runtime_version} Is XNNPACK available: {is_xnnpack_available}