From 089daaeabcc6f05ee5ad171dd123b66d3572efaa Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Tue, 30 Apr 2024 16:42:48 -0700
Subject: [PATCH] Add CUDA Driver API for GPU discovery

We're seeing some corner cases with cudart which might be resolved by
switching to the driver API which comes bundled with the driver package
---
 gpu/gpu.go            |  74 +++++++++++++-
 gpu/gpu_info.h        |   1 +
 gpu/gpu_info_cudart.h |   6 +-
 gpu/gpu_info_nvcuda.c | 213 ++++++++++++++++++++++++++++++++++++++++++
 gpu/gpu_info_nvcuda.h |  71 +++++++++++++++
 5 files changed, 360 insertions(+), 5 deletions(-)
 create mode 100644 gpu/gpu_info_nvcuda.c
 create mode 100644 gpu/gpu_info_nvcuda.h

diff --git a/gpu/gpu.go b/gpu/gpu.go
index 9b915015..35c8d5ad 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -26,6 +26,7 @@ import (
 type handles struct {
 	deviceCount int
 	cudart      *C.cudart_handle_t
+	nvcuda      *C.nvcuda_handle_t
 }
 
 const (
@@ -62,6 +63,26 @@ var CudartWindowsGlobs = []string{
 	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
 }
 
+// NvcudaLinuxGlobs lists the glob patterns searched for the CUDA driver
+// library (libcuda.so) on Linux.
+var NvcudaLinuxGlobs = []string{
+	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
+	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
+	"/usr/lib/*-linux-gnu/libcuda.so*",
+	"/usr/lib/wsl/lib/libcuda.so*",
+	"/usr/lib/wsl/drivers/*/libcuda.so*",
+	"/opt/cuda/lib*/libcuda.so*",
+	"/usr/local/cuda/lib*/libcuda.so*",
+	"/usr/lib*/libcuda.so*",
+	"/usr/local/lib*/libcuda.so*",
+}
+
+// NvcudaWindowsGlobs lists the glob patterns searched for the CUDA driver
+// library (nvcuda.dll) on Windows.
+var NvcudaWindowsGlobs = []string{
+	"c:\\windows\\system*\\nvcuda.dll",
+}
+
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
@@ -74,6 +95,8 @@ func initGPUHandles() *handles {
 	gpuHandles := &handles{}
 	var cudartMgmtName string
 	var cudartMgmtPatterns []string
+	var nvcudaMgmtName string
+	var nvcudaMgmtPatterns []string
 
 	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
@@ -82,6 +105,9 @@ func initGPUHandles() *handles {
 		localAppData := os.Getenv("LOCALAPPDATA")
 		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
 		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
+		// Aligned with driver, we can't carry as payloads
+		nvcudaMgmtName = "nvcuda.dll"
+		nvcudaMgmtPatterns = NvcudaWindowsGlobs
 	case "linux":
 		cudartMgmtName = "libcudart.so*"
 		if tmpDir != "" {
@@ -89,11 +115,25 @@ func initGPUHandles() *handles {
 			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
 		}
 		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
+		// Aligned with driver, we can't carry as payloads
+		nvcudaMgmtName = "libcuda.so*"
+		nvcudaMgmtPatterns = NvcudaLinuxGlobs
 	default:
 		return gpuHandles
 	}
 
 	slog.Info("Detecting GPUs")
+	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
+	if len(nvcudaLibPaths) > 0 {
+		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
+		if nvcuda != nil {
+			slog.Info("detected GPUs", "count", deviceCount, "library", libPath)
+			gpuHandles.nvcuda = nvcuda
+			gpuHandles.deviceCount = deviceCount
+			return gpuHandles
+		}
+	}
+
 	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
 		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
@@ -118,6 +158,9 @@ func GetGPUInfo() GpuInfoList {
 		if gpuHandles.cudart != nil {
 			C.cudart_release(*gpuHandles.cudart)
 		}
+		if gpuHandles.nvcuda != nil {
+			C.nvcuda_release(*gpuHandles.nvcuda)
+		}
 	}()
 
 	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
@@ -138,7 +181,11 @@ func GetGPUInfo() GpuInfoList {
 		gpuInfo := GpuInfo{
 			Library: "cuda",
 		}
-		C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
+		if gpuHandles.cudart != nil {
+			C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
+		} else {
+			C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
+		}
 		if memInfo.err != nil {
 			slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 			C.free(unsafe.Pointer(memInfo.err))
@@ -196,9 +243,10 @@ func GetCPUMem() (memInfo, error) {
 	return ret, nil
 }
 
-func FindGPULibs(baseLibName string, patterns []string) []string {
+func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
+	var patterns []string
 	gpuLibPaths := []string{}
 	slog.Debug("Searching for GPU library", "name", baseLibName)
 
@@ -218,6 +266,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 		}
 		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
 	}
+	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
 	for _, pattern := range patterns {
 		// Ignore glob discovery errors
@@ -267,6 +316,27 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
 	return 0, nil, ""
 }
 
+// LoadNVCUDAMgmt tries each candidate CUDA driver library path in order and
+// returns the device count, the initialized handle, and the path of the first
+// library that nvcuda_init accepts. It returns (0, nil, "") when none load.
+func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
+	var resp C.nvcuda_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range nvcudaLibPaths {
+		lib := C.CString(libPath)
+		C.nvcuda_init(lib, &resp)
+		// Free right away: a defer inside the loop would keep every C string until return
+		C.free(unsafe.Pointer(lib))
+		if resp.err != nil {
+			slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return int(resp.num_devices), &resp.ch, libPath
+		}
+	}
+	return 0, nil, ""
+}
+
 func getVerboseState() C.uint16_t {
 	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
 		return C.uint16_t(1)
diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h
index 0f67442f..577bd3f0 100644
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -58,6 +58,7 @@ void cpu_check_ram(mem_info_t *resp);
 #endif
 
 #include "gpu_info_cudart.h"
+#include "gpu_info_nvcuda.h"
 
 #endif  // __GPU_INFO_H__
 #endif  // __APPLE__
\ No newline at end of file
diff --git a/gpu/gpu_info_cudart.h b/gpu/gpu_info_cudart.h
index ae2579a2..e8a89856 100644
--- a/gpu/gpu_info_cudart.h
+++ b/gpu/gpu_info_cudart.h
@@ -6,9 +6,9 @@
 // Just enough typedef's to dlopen/dlsym for memory information
 typedef enum cudartReturn_enum {
   CUDART_SUCCESS = 0,
-  CUDA_ERROR_INVALID_VALUE = 1,
-  CUDA_ERROR_MEMORY_ALLOCATION = 2,
-  CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
+  CUDART_ERROR_INVALID_VALUE = 1,
+  CUDART_ERROR_MEMORY_ALLOCATION = 2,
+  CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
   // Other values omitted for now...
 } cudartReturn_t;
 
diff --git a/gpu/gpu_info_nvcuda.c b/gpu/gpu_info_nvcuda.c
new file mode 100644
index 00000000..e192d2e6
--- /dev/null
+++ b/gpu/gpu_info_nvcuda.c
@@ -0,0 +1,213 @@
+#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
+
+#include <string.h>
+#include "gpu_info_nvcuda.h"
+
+// Loads the CUDA driver library at nvcuda_lib_path, resolves the driver API
+// entry points listed below into resp->ch, calls cuInit, and stores the device
+// count in resp->num_devices.  On failure resp->err is set to a heap-allocated
+// message that the caller frees, and any library already loaded is unloaded.
+void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
+  CUresult ret;
+  resp->err = NULL;
+  resp->num_devices = 0;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+
+      {"cuInit", (void *)&resp->ch.cuInit},
+      {"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
+      {"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
+      {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
+      {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
+      {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
+      {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
+      {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
+      {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
+      {NULL, NULL},
+  };
+
+  resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
+  if (!resp->ch.handle) {
+    char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
+    snprintf(buf, buflen,
+            "Unable to load %s library to query for Nvidia GPUs: %s",
+            nvcuda_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; l[i].s != NULL; i++) {
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    if (!*l[i].p) {
+      char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->ch.cuInit)(0);
+  if (ret != CUDA_SUCCESS) {
+    LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
+      resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
+      return;
+    }
+    snprintf(buf, buflen, "nvcuda init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  int version = 0;
+  nvcudaDriverVersion_t driverVersion;
+  driverVersion.major = 0;
+  driverVersion.minor = 0;
+
+  // Report driver version if we're in verbose mode, ignore errors
+  ret = (*resp->ch.cuDriverGetVersion)(&version);
+  if (ret != CUDA_SUCCESS) {
+    LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
+  } else {
+    driverVersion.major = version / 1000;
+    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
+    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
+  }
+
+  ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
+  if (ret != CUDA_SUCCESS) {
+    LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+}
+
+const int buflen = 256;
+// Fills resp with the compute capability, an id (the UUID string, or the
+// ordinal if the UUID lookup fails), and total/free memory of device i.  A
+// context is created for the memory query and destroyed before returning.  On
+// failure resp->err is set to a heap-allocated message that the caller frees.
+void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
+  resp->err = NULL;
+  nvcudaMemory_t memInfo = {0,0};
+  CUresult ret;
+  CUdevice device = -1;
+  CUcontext ctx = NULL;
+  char buf[buflen + 1];
+  CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+  if (h.handle == NULL) {
+    resp->err = strdup("nvcuda handle isn't initialized");
+    return;
+  }
+
+  ret = (*h.cuDeviceGet)(&device, i);
+  if (ret != CUDA_SUCCESS) {
+    snprintf(buf, buflen, "nvcuda device failed to initialize");
+    resp->err = strdup(buf);
+    return;
+  }
+
+  resp->major = 0;
+  resp->minor = 0;
+  int major = 0;
+  int minor = 0;
+  ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
+  } else {
+    ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
+    if (ret != CUDA_SUCCESS) {
+      LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
+    } else {
+      resp->minor = minor;
+      resp->major = major;
+    }
+  }
+
+  ret = (*h.cuDeviceGetUuid)(&uuid, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
+    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
+  } else {
+    // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
+    snprintf(&resp->gpu_id[0], GPU_ID_LEN,
+        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+        uuid.bytes[0],
+        uuid.bytes[1],
+        uuid.bytes[2],
+        uuid.bytes[3],
+        uuid.bytes[4],
+        uuid.bytes[5],
+        uuid.bytes[6],
+        uuid.bytes[7],
+        uuid.bytes[8],
+        uuid.bytes[9],
+        uuid.bytes[10],
+        uuid.bytes[11],
+        uuid.bytes[12],
+        uuid.bytes[13],
+        uuid.bytes[14],
+        uuid.bytes[15]
+      );
+  }
+
+  // To get memory we have to set (and release) a context
+  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
+  if (ret != CUDA_SUCCESS) {
+    snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
+  if (ret != CUDA_SUCCESS) {
+    snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
+    resp->err = strdup(buf);
+    // Best effort on failure...
+    (*h.cuCtxDestroy)(ctx);
+    return;
+  }
+
+  resp->total = memInfo.total;
+  resp->free = memInfo.free;
+
+  LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
+  LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
+  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
+
+
+
+  ret = (*h.cuCtxDestroy)(ctx);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda failed to release primary device context %d\n", ret);
+  }
+}
+
+// Unloads the driver library held by h.  h is passed by value, so clearing
+// h.handle below does not affect the caller's copy.
+void nvcuda_release(nvcuda_handle_t h) {
+  LOG(h.verbose, "releasing nvcuda library\n");
+  UNLOAD_LIBRARY(h.handle);
+  // TODO and other context release logic?
+  h.handle = NULL;
+}
+
+#endif // __APPLE__
\ No newline at end of file
diff --git a/gpu/gpu_info_nvcuda.h b/gpu/gpu_info_nvcuda.h
new file mode 100644
index 00000000..c4d94edd
--- /dev/null
+++ b/gpu/gpu_info_nvcuda.h
@@ -0,0 +1,71 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_NVCUDA_H__
+#define __GPU_INFO_NVCUDA_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum cudaError_enum {
+  CUDA_SUCCESS = 0,
+  CUDA_ERROR_INVALID_VALUE = 1,
+  CUDA_ERROR_MEMORY_ALLOCATION = 2,
+  CUDA_ERROR_NOT_INITIALIZED = 3,
+  CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
+  // Other values omitted for now...
+} CUresult;
+
+typedef enum CUdevice_attribute_enum {
+  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
+
+  // TODO - not yet wired up but may be useful for Jetson or other
+  // integrated GPU scenarios with shared memory
+  CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
+
+} CUdevice_attribute;
+
+typedef void *nvcudaDevice_t; // Opaque is sufficient
+typedef struct nvcudaMemory_st {
+  uint64_t total;
+  uint64_t free;
+} nvcudaMemory_t;
+
+typedef struct nvcudaDriverVersion {
+  int major;
+  int minor;
+} nvcudaDriverVersion_t;
+
+typedef struct CUuuid_st {
+  unsigned char bytes[16];
+} CUuuid;
+
+typedef int CUdevice;
+typedef void* CUcontext;
+
+typedef struct nvcuda_handle {
+  void *handle;
+  uint16_t verbose;
+  CUresult (*cuInit)(unsigned int Flags);
+  CUresult (*cuDriverGetVersion)(int *driverVersion);
+  CUresult (*cuDeviceGetCount)(int *);
+  CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
+  CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
+  CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
+
+  // Context specific aspects
+  CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
+  CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
+  CUresult (*cuCtxDestroy)(CUcontext ctx);
+} nvcuda_handle_t;
+
+typedef struct nvcuda_init_resp {
+  char *err; // If err is non-null handle is invalid
+  nvcuda_handle_t ch;
+  int num_devices;
+} nvcuda_init_resp_t;
+
+void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
+void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_release(nvcuda_handle_t ch);
+
+#endif // __GPU_INFO_NVCUDA_H__
+#endif // __APPLE__