From df325373126485ef37bfb23c1ccf59e4beec355a Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Fri, 5 Jan 2024 11:25:58 -0500 Subject: [PATCH] gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments --- gpu/gpu_info_cuda.c | 41 ++++++++++++++++++++++++++++------------- gpu/gpu_info_cuda.h | 1 + 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c index 52738710..c81293aa 100644 --- a/gpu/gpu_info_cuda.c +++ b/gpu/gpu_info_cuda.c @@ -20,6 +20,8 @@ const char *cuda_lib_paths[] = { }; #endif +#define CUDA_LOOKUP_SIZE 5 + void cuda_init(cuda_init_resp_t *resp) { nvmlReturn_t ret; resp->err = NULL; @@ -30,11 +32,12 @@ void cuda_init(cuda_init_resp_t *resp) { struct lookup { char *s; void **p; - } l[4] = { + } l[CUDA_LOOKUP_SIZE] = { {"nvmlInit_v2", (void *)&resp->ch.initFn}, {"nvmlShutdown", (void *)&resp->ch.shutdownFn}, {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle}, {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo}, + {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount}, }; for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) { @@ -52,7 +55,7 @@ void cuda_init(cuda_init_resp_t *resp) { return; } - for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list + for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); if (!l[i].p) { UNLOAD_LIBRARY(resp->ch.handle); @@ -89,22 +92,34 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) { return; } - // TODO - handle multiple GPUs - ret = (*h.getHandle)(0, &device); + unsigned int devices; + ret = (*h.getCount)(&devices); if (ret != NVML_SUCCESS) { - snprintf(buf, buflen, "unable to get device handle: %d", ret); + snprintf(buf, buflen, "unable to get device count: %d", ret); resp->err = strdup(buf); return; } - ret = (*h.getMemInfo)(device, &memInfo); - if (ret != NVML_SUCCESS) { - snprintf(buf, buflen, "device memory info lookup failure: %d", ret); - resp->err = strdup(buf); - return; + resp->total = 0; + resp->free = 0; + + for (i = 0; i < devices; i++) { + ret = (*h.getHandle)(i, &device); + if (ret != NVML_SUCCESS) { + snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret); + resp->err = strdup(buf); + return; + } + + ret = (*h.getMemInfo)(device, &memInfo); + if (ret != NVML_SUCCESS) { + snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret); + resp->err = strdup(buf); + return; + } + + resp->total += memInfo.total; + resp->free += memInfo.free; } - resp->total = memInfo.total; - resp->free = memInfo.free; - return; } #endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_cuda.h b/gpu/gpu_info_cuda.h index 7d13cb6a..9a66a735 100644 --- a/gpu/gpu_info_cuda.h +++ b/gpu/gpu_info_cuda.h @@ -21,6 +21,7 @@ typedef struct cuda_handle { nvmlReturn_t (*shutdownFn)(void); nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *); nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *); + nvmlReturn_t (*getCount)(unsigned int *); } cuda_handle_t; typedef struct cuda_init_resp {