From df325373126485ef37bfb23c1ccf59e4beec355a Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Fri, 5 Jan 2024 11:25:58 -0500
Subject: [PATCH] gpu: read memory info from all cuda devices (#1802)

* gpu: read memory info from all cuda devices

* add `LOOKUP_SIZE` constant

* better constant name

* address comments
---
 gpu/gpu_info_cuda.c | 41 ++++++++++++++++++++++++++++-------------
 gpu/gpu_info_cuda.h |  1 +
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c
index 52738710..c81293aa 100644
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -20,6 +20,8 @@ const char *cuda_lib_paths[] = {
 };
 #endif
 
+#define CUDA_LOOKUP_SIZE 5
+
 void cuda_init(cuda_init_resp_t *resp) {
   nvmlReturn_t ret;
   resp->err = NULL;
@@ -30,11 +32,12 @@ void cuda_init(cuda_init_resp_t *resp) {
   struct lookup {
     char *s;
     void **p;
-  } l[4] = {
+  } l[CUDA_LOOKUP_SIZE] = {
       {"nvmlInit_v2", (void *)&resp->ch.initFn},
       {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
       {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
+      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
   };
 
   for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
@@ -52,7 +55,7 @@ void cuda_init(cuda_init_resp_t *resp) {
     return;
   }
 
-  for (i = 0; i < 4; i++) {  // TODO - fix this to use a null terminated list
+  for (i = 0; i < CUDA_LOOKUP_SIZE; i++) {  // TODO - fix this to use a null terminated list
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
-    if (!l[i].p) {
+    if (!*l[i].p) {  // test the loaded symbol; the slot address l[i].p is never NULL
       UNLOAD_LIBRARY(resp->ch.handle);
@@ -89,22 +92,34 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
     return;
   }
 
-  // TODO - handle multiple GPUs
-  ret = (*h.getHandle)(0, &device);
+  unsigned int devices;
+  ret = (*h.getCount)(&devices);
   if (ret != NVML_SUCCESS) {
-    snprintf(buf, buflen, "unable to get device handle: %d", ret);
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
     return;
   }
 
-  ret = (*h.getMemInfo)(device, &memInfo);
-  if (ret != NVML_SUCCESS) {
-    snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
-    resp->err = strdup(buf);
-    return;
+  resp->total = 0;
+  resp->free = 0;
+
+  for (i = 0; i < devices; i++) {
+    ret = (*h.getHandle)(i, &device);
+    if (ret != NVML_SUCCESS) {
+      snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+
+    ret = (*h.getMemInfo)(device, &memInfo);
+    if (ret != NVML_SUCCESS) {
+      snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
+      resp->err = strdup(buf);
+      return;
+    }
+
+    resp->total += memInfo.total;
+    resp->free += memInfo.free;
   }
-  resp->total = memInfo.total;
-  resp->free = memInfo.free;
-  return;
 }
 #endif  // __APPLE__
\ No newline at end of file
diff --git a/gpu/gpu_info_cuda.h b/gpu/gpu_info_cuda.h
index 7d13cb6a..9a66a735 100644
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -21,6 +21,7 @@ typedef struct cuda_handle {
   nvmlReturn_t (*shutdownFn)(void);
   nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
   nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
+  nvmlReturn_t (*getCount)(unsigned int *);
 } cuda_handle_t;
 
 typedef struct cuda_init_resp {