diff --git a/gpu/gpu.go b/gpu/gpu.go
index d7a3ba44..988c20af 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -28,6 +28,7 @@ type cudaHandles struct {
 	deviceCount int
 	cudart      *C.cudart_handle_t
 	nvcuda      *C.nvcuda_handle_t
+	nvml        *C.nvml_handle_t
 }
 
 type oneapiHandles struct {
@@ -50,6 +51,7 @@ var (
 	nvcudaLibPath string
 	cudartLibPath string
 	oneapiLibPath string
+	nvmlLibPath   string
 	rocmGPUs      []RocmGPUInfo
 	oneapiGPUs    []OneapiGPUInfo
 )
@@ -81,6 +83,10 @@ var CudartWindowsGlobs = []string{
 	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
 }
 
+var NvmlWindowsGlobs = []string{
+	"c:\\Windows\\System32\\nvml.dll",
+}
+
 var NvcudaLinuxGlobs = []string{
 	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
 	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
@@ -117,6 +123,10 @@ func initCudaHandles() *cudaHandles {
 	cHandles := &cudaHandles{}
 
 	// Short Circuit if we already know which library to use
+	if nvmlLibPath != "" {
+		cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath})
+		return cHandles
+	}
 	if nvcudaLibPath != "" {
 		cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
 		return cHandles
@@ -131,6 +141,8 @@ func initCudaHandles() *cudaHandles {
 	var cudartMgmtPatterns []string
 	var nvcudaMgmtName string
 	var nvcudaMgmtPatterns []string
+	var nvmlMgmtName string
+	var nvmlMgmtPatterns []string
 
 	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
@@ -142,6 +154,12 @@ func initCudaHandles() *cudaHandles {
 		// Aligned with driver, we can't carry as payloads
 		nvcudaMgmtName = "nvcuda.dll"
 		nvcudaMgmtPatterns = NvcudaWindowsGlobs
+
+		// Use nvml to refresh free memory on windows only
+		nvmlMgmtName = "nvml.dll"
+		nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
+		copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
+
 	case "linux":
 		cudartMgmtName = "libcudart.so*"
 		if tmpDir != "" {
@@ -152,10 +170,24 @@ func initCudaHandles() *cudaHandles {
 		// Aligned with driver, we can't carry as payloads
 		nvcudaMgmtName = "libcuda.so*"
 		nvcudaMgmtPatterns = NvcudaLinuxGlobs
+
+		// nvml omitted on linux
 	default:
 		return cHandles
 	}
 
+	if len(nvmlMgmtPatterns) > 0 {
+		nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
+		if len(nvmlLibPaths) > 0 {
+			nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
+			if nvml != nil {
+				slog.Debug("nvidia-ml loaded", "library", libPath)
+				cHandles.nvml = nvml
+				nvmlLibPath = libPath
+			}
+		}
+	}
+
 	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
@@ -230,6 +262,9 @@ func GetGPUInfo() GpuInfoList {
 		if cHandles.nvcuda != nil {
 			C.nvcuda_release(*cHandles.nvcuda)
 		}
+		if cHandles.nvml != nil {
+			C.nvml_release(*cHandles.nvml)
+		}
 	}
 	if oHandles != nil {
 		if oHandles.oneapi != nil {
@@ -365,10 +400,17 @@ func GetGPUInfo() GpuInfoList {
 			cHandles = initCudaHandles()
 		}
 		for i, gpu := range cudaGPUs {
-			if cHandles.cudart != nil {
+			if cHandles.nvml != nil {
+				C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
+			} else if cHandles.cudart != nil {
 				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
+			} else if cHandles.nvcuda != nil {
+				C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
+				memInfo.used = memInfo.total - memInfo.free
 			} else {
-				C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free)
+				// shouldn't happen
+				slog.Warn("no valid cuda library loaded to refresh vram usage")
+				break
 			}
 			if memInfo.err != nil {
 				slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
@@ -379,7 +421,21 @@ func GetGPUInfo() GpuInfoList {
 				slog.Warn("error looking up nvidia GPU memory")
 				continue
 			}
-			slog.Debug("updating cuda free memory", "gpu", gpu.ID, "name", gpu.Name, "before", format.HumanBytes2(gpu.FreeMemory), "now", format.HumanBytes2(uint64(memInfo.free)))
+			slog.Debug("updating cuda memory data",
+				"gpu", gpu.ID,
+				"name", gpu.Name,
+				slog.Group(
+					"before",
+					"total", format.HumanBytes2(gpu.TotalMemory),
+					"free", format.HumanBytes2(gpu.FreeMemory),
+				),
+				slog.Group(
+					"now",
+					"total", format.HumanBytes2(uint64(memInfo.total)),
+					"free", format.HumanBytes2(uint64(memInfo.free)),
+					"used", format.HumanBytes2(uint64(memInfo.used)),
+				),
+			)
 			cudaGPUs[i].FreeMemory = uint64(memInfo.free)
 		}
 
@@ -530,6 +586,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
 	return 0, nil, ""
 }
 
+func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) {
+	var resp C.nvml_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range nvmlLibPaths {
+		lib := C.CString(libPath)
+		defer C.free(unsafe.Pointer(lib))
+		C.nvml_init(lib, &resp)
+		if resp.err != nil {
+			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return &resp.ch, libPath
+		}
+	}
+	return nil, ""
+}
+
 func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 	var resp C.oneapi_init_resp_t
 	num_devices := 0
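
Note: the gpu.go changes above establish a preference order when refreshing per-GPU memory: NVML first (it reports used bytes directly), then cudart, then the nvcuda driver API, where used must be derived. A minimal pure-Go sketch of that selection logic follows; the types and function fields are hypothetical stand-ins for the cgo handles, not part of the patch.

package main

import (
	"errors"
	"fmt"
)

// memSnapshot mirrors the fields the patch adds to mem_info_t.
type memSnapshot struct {
	free, total, used uint64
}

// cudaHandles here is a hypothetical stand-in for the cgo struct: nil-able
// function fields play the role of the nvml/cudart/nvcuda handle pointers.
type cudaHandles struct {
	nvmlGetFree   func(i int) (free, total, used uint64)
	cudartGetFree func(i int) (free, total, used uint64)
	nvcudaGetFree func(i int) (free, total uint64)
}

// refresh reproduces the branch order in GetGPUInfo: NVML first, then cudart,
// then the driver API, where "used" must be derived as total - free.
func (ch cudaHandles) refresh(i int) (memSnapshot, error) {
	switch {
	case ch.nvmlGetFree != nil:
		f, t, u := ch.nvmlGetFree(i)
		return memSnapshot{f, t, u}, nil
	case ch.cudartGetFree != nil:
		f, t, u := ch.cudartGetFree(i)
		return memSnapshot{f, t, u}, nil
	case ch.nvcudaGetFree != nil:
		f, t := ch.nvcudaGetFree(i)
		return memSnapshot{free: f, total: t, used: t - f}, nil
	}
	// Mirrors the "shouldn't happen" warning branch in the patch.
	return memSnapshot{}, errors.New("no valid cuda library loaded to refresh vram usage")
}

func main() {
	// Driver-API-only handles: "used" comes out derived.
	ch := cudaHandles{nvcudaGetFree: func(int) (uint64, uint64) { return 2 << 30, 8 << 30 }}
	s, _ := ch.refresh(0)
	fmt.Println(s.used == 6<<30) // true
}

Modeling the handles as nil-able fields mirrors how the cgo pointers gate each branch in the real code.
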
"error", C.GoString(memInfo.err)) @@ -379,7 +421,21 @@ func GetGPUInfo() GpuInfoList { slog.Warn("error looking up nvidia GPU memory") continue } - slog.Debug("updating cuda free memory", "gpu", gpu.ID, "name", gpu.Name, "before", format.HumanBytes2(gpu.FreeMemory), "now", format.HumanBytes2(uint64(memInfo.free))) + slog.Debug("updating cuda memory data", + "gpu", gpu.ID, + "name", gpu.Name, + slog.Group( + "before", + "total", format.HumanBytes2(gpu.TotalMemory), + "free", format.HumanBytes2(gpu.FreeMemory), + ), + slog.Group( + "now", + "total", format.HumanBytes2(uint64(memInfo.total)), + "free", format.HumanBytes2(uint64(memInfo.free)), + "used", format.HumanBytes2(uint64(memInfo.used)), + ), + ) cudaGPUs[i].FreeMemory = uint64(memInfo.free) } @@ -530,6 +586,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) { return 0, nil, "" } +func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) { + var resp C.nvml_init_resp_t + resp.ch.verbose = getVerboseState() + for _, libPath := range nvmlLibPaths { + lib := C.CString(libPath) + defer C.free(unsafe.Pointer(lib)) + C.nvml_init(lib, &resp) + if resp.err != nil { + slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err))) + C.free(unsafe.Pointer(resp.err)) + } else { + return &resp.ch, libPath + } + } + return nil, "" +} + func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) { var resp C.oneapi_init_resp_t num_devices := 0 diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h index 482b81a6..ab0952d9 100644 --- a/gpu/gpu_info.h +++ b/gpu/gpu_info.h @@ -47,6 +47,7 @@ typedef struct mem_info { char gpu_name[GPU_NAME_LEN]; uint64_t total; uint64_t free; + uint64_t used; // Compute Capability int major; @@ -62,6 +63,7 @@ void cpu_check_ram(mem_info_t *resp); #include "gpu_info_cudart.h" #include "gpu_info_nvcuda.h" +#include "gpu_info_nvml.h" #include "gpu_info_oneapi.h" #endif // __GPU_INFO_H__ diff --git a/gpu/gpu_info_cudart.c b/gpu/gpu_info_cudart.c index 12963825..9db89529 100644 --- a/gpu/gpu_info_cudart.c +++ b/gpu/gpu_info_cudart.c @@ -166,9 +166,11 @@ void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) { resp->total = memInfo.total; resp->free = memInfo.free; + resp->used = memInfo.used; LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total); LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free); + LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used); LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor); } diff --git a/gpu/gpu_info_nvcuda.c b/gpu/gpu_info_nvcuda.c index f48b0389..675ce5cc 100644 --- a/gpu/gpu_info_nvcuda.c +++ b/gpu/gpu_info_nvcuda.c @@ -197,12 +197,12 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) { } } -void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free) { +void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) { CUresult ret; CUcontext ctx = NULL; CUdevice device = -1; *free = 0; - uint64_t total = 0; + *total = 0; ret = (*h.cuDeviceGet)(&device, i); if (ret != CUDA_SUCCESS) { @@ -218,7 +218,7 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free) { return; } - ret = (*h.cuMemGetInfo_v2)(free, &total); + ret = (*h.cuMemGetInfo_v2)(free, total); if (ret != CUDA_SUCCESS) { LOG(1, "nvcuda device memory info lookup failure %d", ret); // Best effort on failure... 
diff --git a/gpu/gpu_info_nvcuda.h b/gpu/gpu_info_nvcuda.h
index 87429894..f9654f64 100644
--- a/gpu/gpu_info_nvcuda.h
+++ b/gpu/gpu_info_nvcuda.h
@@ -68,7 +68,7 @@ typedef struct nvcuda_init_resp {
 
 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
 void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
-void nvcuda_get_free(nvcuda_handle_t ch, int device_id, uint64_t *free);
+void nvcuda_get_free(nvcuda_handle_t ch, int device_id, uint64_t *free, uint64_t *total);
 void nvcuda_release(nvcuda_handle_t ch);
 
 #endif // __GPU_INFO_NVCUDA_H__
diff --git a/gpu/gpu_info_nvml.c b/gpu/gpu_info_nvml.c
new file mode 100644
index 00000000..e152a45c
--- /dev/null
+++ b/gpu/gpu_info_nvml.c
@@ -0,0 +1,105 @@
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+
+#include <string.h>
+
+#include "gpu_info_nvml.h"
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
+  nvmlReturn_t ret;
+  resp->err = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
+      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
+      {NULL, NULL},
+  };
+
+  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
+  if (!resp->ch.handle) {
+    char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Nvidia GPUs: %s",
+             nvml_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
+
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    if (!*(l[i].p)) {
+      char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->ch.nvmlInit_v2)();
+  if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+}
+
+void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
+  nvmlDevice_t device;
+  nvmlMemory_t memInfo = {0};
+  nvmlReturn_t ret;
+  ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
+  if (ret != NVML_SUCCESS) {
+    LOG(1, "unable to get device handle %d: %d", device_id, ret);
+    *free = 0;
+    *total = 0;
+    *used = 0;
+    return;
+  }
+
+  ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
+  if (ret != NVML_SUCCESS) {
+    LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
+    *free = 0;
+    *total = 0;
+    *used = 0;
+    return;
+  }
+  *free = memInfo.free;
+  *total = memInfo.total;
+  *used = memInfo.used;
+}
+
+void nvml_release(nvml_handle_t h) {
+  LOG(h.verbose, "releasing nvml library\n");
+  nvmlReturn_t ret;
+  ret = (*h.nvmlShutdown)();
+  if (ret != NVML_SUCCESS) {
+    LOG(1, "error during nvmlShutdown %d", ret);
+  }
+  UNLOAD_LIBRARY(h.handle);
+  h.handle = NULL;
+}
+
+#endif  // __APPLE__
\ No newline at end of file
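
Note: nvml_init above uses a table-driven dlsym loop: each required symbol is resolved into a function pointer, and a single missing symbol unloads the library and fails the whole init. A pure-Go sketch of the same wiring pattern follows; the names here are hypothetical stand-ins for dlopen/dlsym (LOAD_SYMBOL) on nvml.dll.

package main

import (
	"errors"
	"fmt"
)

// nvmlHandle is a pure-Go stand-in for nvml_handle_t; the function fields
// play the role of the resolved C function pointers.
type nvmlHandle struct {
	initV2   func() error
	shutdown func() error
}

// wire walks a table of (symbol name, destination) pairs, as the C loop
// does with LOAD_SYMBOL: one missing symbol fails the whole load.
// The lookup parameter is a hypothetical stand-in for dlsym.
func wire(lookup func(name string) (func() error, bool)) (*nvmlHandle, error) {
	h := &nvmlHandle{}
	table := []struct {
		name string
		dst  *func() error
	}{
		{"nvmlInit_v2", &h.initV2},
		{"nvmlShutdown", &h.shutdown},
	}
	for _, e := range table {
		fn, ok := lookup(e.name)
		if !ok {
			return nil, errors.New("symbol lookup for " + e.name + " failed")
		}
		*e.dst = fn
	}
	return h, nil
}

func main() {
	syms := map[string]func() error{
		"nvmlInit_v2":  func() error { return nil },
		"nvmlShutdown": func() error { return nil },
	}
	h, err := wire(func(name string) (func() error, bool) {
		fn, ok := syms[name]
		return fn, ok
	})
	fmt.Println(h != nil, err) // true <nil>
}
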
"releasing nvml library\n"); + nvmlReturn_t ret; + ret = (*h.nvmlShutdown)(); + if (ret != NVML_SUCCESS) { + LOG(1, "error during nvmlShutdown %d", ret); + } + UNLOAD_LIBRARY(h.handle); + h.handle = NULL; +} + +#endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_nvml.h b/gpu/gpu_info_nvml.h new file mode 100644 index 00000000..a661f723 --- /dev/null +++ b/gpu/gpu_info_nvml.h @@ -0,0 +1,48 @@ +#ifndef __APPLE__ +#ifndef __GPU_INFO_NVML_H__ +#define __GPU_INFO_NVML_H__ +#include "gpu_info.h" + +// Just enough typedef's to dlopen/dlsym for memory information +typedef enum nvmlReturn_enum { + NVML_SUCCESS = 0, + // Other values omitted for now... +} nvmlReturn_t; +typedef void *nvmlDevice_t; // Opaque is sufficient +typedef struct nvmlMemory_st { + unsigned long long total; + unsigned long long free; + unsigned long long used; +} nvmlMemory_t; + +typedef enum nvmlBrandType_enum +{ + NVML_BRAND_UNKNOWN = 0, +} nvmlBrandType_t; + +typedef struct nvml_handle { + void *handle; + uint16_t verbose; + nvmlReturn_t (*nvmlInit_v2)(void); + nvmlReturn_t (*nvmlShutdown)(void); + nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *); + nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *); +} nvml_handle_t; + +typedef struct nvml_init_resp { + char *err; // If err is non-null handle is invalid + nvml_handle_t ch; +} nvml_init_resp_t; + +typedef struct nvml_compute_capability { + char *err; + int major; + int minor; +} nvml_compute_capability_t; + +void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp); +void nvml_get_free(nvml_handle_t ch, int device_id, uint64_t *free, uint64_t *total, uint64_t *used); +void nvml_release(nvml_handle_t ch); + +#endif // __GPU_INFO_NVML_H__ +#endif // __APPLE__ \ No newline at end of file diff --git a/server/sched.go b/server/sched.go index 9eb4430e..f6f12a67 100644 --- a/server/sched.go +++ b/server/sched.go @@ -487,8 +487,10 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool func (runner *runnerRef) waitForVRAMRecovery() chan interface{} { finished := make(chan interface{}, 1) - // CPU or Metal don't need checking, so no waiting required, windows can page VRAM, and the APIs we query tend to be optimistic on free space - if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || runtime.GOOS == "windows" { + // CPU or Metal don't need checking, so no waiting required + // windows can page VRAM, only cuda currently can report accurate used vram usage + if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || + (runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") { finished <- struct{}{} return finished }