From 907688f43dd777eca666f86a8fa330b8721be19d Mon Sep 17 00:00:00 2001
From: Nicholas Sielicki <nslick@amazon.com>
Date: Sun, 8 Sep 2024 09:51:04 -0700
Subject: [PATCH] hmem/cuda: avoid stub loading at runtime

When the CUDA toolkit is installed, a set of "stub" libraries is
installed under /usr/local/cuda*/lib64/stubs/. These libraries include a
SONAME field with a `.1` suffix, but the filenames of these stubs are
bare, e.g.:

 > $ readelf -d /usr/local/cuda-12.5/lib64/stubs/libnvidia-ml.so | grep soname
 > 0x000000000000000e (SONAME)  Library soname: [libnvidia-ml.so.1]

The CUDA toolkit does not include any library file with the name
`libnvidia-ml.so.1` (or `libcuda.so.1`, etc.), as these are provided by
the driver package. This disconnect between the stub filename in the
toolkit and the SONAME within it is done intentionally to allow linking
with the stub at build time, while ensuring it's never loaded at
runtime.

In normal dynamic linking cases (i.e., without dlopen), the stub's
SONAME, `libnvidia-ml.so.1`, is what gets recorded in the DT_NEEDED tag.
A file with that name can only come from a driver package, which ensures
that the stub library will never match at runtime.

Match that behavior by passing `.1`-suffixed names to dlopen where
appropriate for NVIDIA libraries.

Signed-off-by: Nicholas Sielicki <nslick@amazon.com>
---
 fabtests/common/hmem_cuda.c |  4 ++--
 src/hmem_cuda.c             | 15 +++++----------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/fabtests/common/hmem_cuda.c b/fabtests/common/hmem_cuda.c
index 2f02b6f474c..e4aef962fb6 100644
--- a/fabtests/common/hmem_cuda.c
+++ b/fabtests/common/hmem_cuda.c
@@ -157,9 +157,9 @@ int ft_cuda_init(void)
 		goto err;
 	}
 
-	cuda_handle = dlopen("libcuda.so", RTLD_NOW);
+	cuda_handle = dlopen("libcuda.so.1", RTLD_NOW);
 	if (!cuda_handle) {
-		FT_ERR("Failed to dlopen libcuda.so\n");
+		FT_ERR("Failed to dlopen libcuda.so.1\n");
 		goto err_dlclose_cudart;
 	}
 
diff --git a/src/hmem_cuda.c b/src/hmem_cuda.c
index 1c8abb03285..1e3b9cdc10c 100644
--- a/src/hmem_cuda.c
+++ b/src/hmem_cuda.c
@@ -487,22 +487,17 @@ static int cuda_hmem_dl_init(void)
 		return -FI_ENOSYS;
 	}
 
-	cuda_attr.driver_handle = dlopen("libcuda.so", RTLD_NOW);
+	cuda_attr.driver_handle = dlopen("libcuda.so.1", RTLD_NOW);
 	if (!cuda_attr.driver_handle) {
 		FI_WARN(&core_prov, FI_LOG_CORE,
-			"Failed to dlopen libcuda.so\n");
+			"Failed to dlopen libcuda.so.1\n");
 		goto err_dlclose_cuda_runtime;
 	}
 
-	cuda_attr.nvml_handle = dlopen("libnvidia-ml.so", RTLD_NOW);
+	cuda_attr.nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
 	if (!cuda_attr.nvml_handle) {
-		FI_INFO(&core_prov, FI_LOG_CORE,
-			"Failed to dlopen libnvidia-ml.so.  Trying libnvidia-ml.so.1\n");
-		cuda_attr.nvml_handle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
-		if (!cuda_attr.nvml_handle) {
-			FI_WARN(&core_prov, FI_LOG_CORE,
-			"Failed to dlopen libnvidia-ml.so or libnvidia-ml.so.1, bypassing nvml calls\n");
-		}
+		FI_WARN(&core_prov, FI_LOG_CORE,
+			"Failed to dlopen libnvidia-ml.so.1, bypassing nvml calls\n");
 	}
 
 	CUDA_DRIVER_FUNCS_DEF(CUDA_DRIVER_FUNCS_DLOPEN)