From f6f759fc5fb4868125b8a25c28ce96d2c0980ef7 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 9 Jul 2024 10:27:53 -0700 Subject: [PATCH] Detect CUDA OS Overhead This adds logic to detect skew between the driver and management library, which can be attributed to OS overhead, and records it so we can adjust subsequent management library free VRAM updates and avoid OOM scenarios. --- gpu/gpu.go | 27 +++++++++++++++++++++++++++ gpu/types.go | 3 ++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index 29a3c103..58144991 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -274,6 +274,28 @@ func GetGPUInfo() GpuInfoList { gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor + // query the management library as well so we can record any skew between the two + // which represents overhead on the GPU we must set aside on subsequent updates + if cHandles.nvml != nil { + C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used) + if memInfo.err != nil { + slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err)) + C.free(unsafe.Pointer(memInfo.err)) + } else { + if memInfo.free != 0 && uint64(memInfo.free) > gpuInfo.FreeMemory { + gpuInfo.OSOverhead = uint64(memInfo.free) - gpuInfo.FreeMemory + slog.Info("detected OS VRAM overhead", + "id", gpuInfo.ID, + "library", gpuInfo.Library, + "compute", gpuInfo.Compute, + "driver", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor), + "name", gpuInfo.Name, + "overhead", format.HumanBytes2(gpuInfo.OSOverhead), + ) + } + } + } + // TODO potentially sort on our own algorithm instead of what the underlying GPU library does... 
cudaGPUs = append(cudaGPUs, gpuInfo) } @@ -374,9 +396,14 @@ func GetGPUInfo() GpuInfoList { slog.Warn("error looking up nvidia GPU memory") continue } + if cHandles.nvml != nil && gpu.OSOverhead > 0 { + // When using the management library update based on recorded overhead + memInfo.free -= C.uint64_t(gpu.OSOverhead) + } slog.Debug("updating cuda memory data", "gpu", gpu.ID, "name", gpu.Name, + "overhead", format.HumanBytes2(gpu.OSOverhead), slog.Group( "before", "total", format.HumanBytes2(gpu.TotalMemory), diff --git a/gpu/types.go b/gpu/types.go index 2eaa9bae..7a7749b8 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -52,7 +52,8 @@ type CPUInfo struct { type CudaGPUInfo struct { GpuInfo - index int //nolint:unused,nolintlint + OSOverhead uint64 // Memory overhead between the driver library and management library + index int //nolint:unused,nolintlint } type CudaGPUInfoList []CudaGPUInfo