diff --git a/gpu/gpu.go b/gpu/gpu.go index 9b915015..35c8d5ad 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -26,6 +26,7 @@ import ( type handles struct { deviceCount int cudart *C.cudart_handle_t + nvcuda *C.nvcuda_handle_t } const ( @@ -62,6 +63,22 @@ var CudartWindowsGlobs = []string{ "c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll", } +var NvcudaLinuxGlobs = []string{ + "/usr/local/cuda*/targets/*/lib/libcuda.so*", + "/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*", + "/usr/lib/*-linux-gnu/libcuda.so*", + "/usr/lib/wsl/lib/libcuda.so*", + "/usr/lib/wsl/drivers/*/libcuda.so*", + "/opt/cuda/lib*/libcuda.so*", + "/usr/local/cuda/lib*/libcuda.so*", + "/usr/lib*/libcuda.so*", + "/usr/local/lib*/libcuda.so*", +} + +var NvcudaWindowsGlobs = []string{ + "c:\\windows\\system*\\nvcuda.dll", +} + // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. var CudaTegra string = os.Getenv("JETSON_JETPACK") @@ -74,6 +91,8 @@ func initGPUHandles() *handles { gpuHandles := &handles{} var cudartMgmtName string var cudartMgmtPatterns []string + var nvcudaMgmtName string + var nvcudaMgmtPatterns []string tmpDir, _ := PayloadsDir() switch runtime.GOOS { @@ -82,6 +101,9 @@ func initGPUHandles() *handles { localAppData := os.Getenv("LOCALAPPDATA") cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)} cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...) + // Aligned with driver, we can't carry as payloads + nvcudaMgmtName = "nvcuda.dll" + nvcudaMgmtPatterns = NvcudaWindowsGlobs case "linux": cudartMgmtName = "libcudart.so*" if tmpDir != "" { @@ -89,11 +111,25 @@ func initGPUHandles() *handles { cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)} } cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...) 
+ // Aligned with driver, we can't carry as payloads + nvcudaMgmtName = "libcuda.so*" + nvcudaMgmtPatterns = NvcudaLinuxGlobs default: return gpuHandles } slog.Info("Detecting GPUs") + nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns) + if len(nvcudaLibPaths) > 0 { + deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths) + if nvcuda != nil { + slog.Info("detected GPUs", "count", deviceCount, "library", libPath) + gpuHandles.nvcuda = nvcuda + gpuHandles.deviceCount = deviceCount + return gpuHandles + } + } + cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns) if len(cudartLibPaths) > 0 { deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths) @@ -118,6 +154,9 @@ func GetGPUInfo() GpuInfoList { if gpuHandles.cudart != nil { C.cudart_release(*gpuHandles.cudart) } + if gpuHandles.nvcuda != nil { + C.nvcuda_release(*gpuHandles.nvcuda) + } }() // All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX @@ -138,7 +177,11 @@ func GetGPUInfo() GpuInfoList { gpuInfo := GpuInfo{ Library: "cuda", } - C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo) + if gpuHandles.cudart != nil { + C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo) + } else { + C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo) + } if memInfo.err != nil { slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err)) C.free(unsafe.Pointer(memInfo.err)) @@ -196,9 +239,10 @@ func GetCPUMem() (memInfo, error) { return ret, nil } -func FindGPULibs(baseLibName string, patterns []string) []string { +func FindGPULibs(baseLibName string, defaultPatterns []string) []string { // Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them var ldPaths []string + var patterns []string gpuLibPaths := []string{} slog.Debug("Searching for GPU library", "name", baseLibName) @@ -218,6 +262,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string { } 
patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
 	}
+	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
 	for _, pattern := range patterns {
 		// Ignore glob discovery errors
@@ -267,6 +312,29 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
 	return 0, nil, ""
 }
 
+// LoadNVCUDAMgmt tries each candidate driver library in nvcudaLibPaths in order
+// and returns the device count, the initialized handle and the path of the
+// first library that nvcuda_init accepts. It returns (0, nil, "") when none of
+// them can be loaded.
+func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
+	var resp C.nvcuda_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range nvcudaLibPaths {
+		lib := C.CString(libPath)
+		C.nvcuda_init(lib, &resp)
+		// nvcuda_init only uses the path while it runs, so free the C copy per
+		// iteration instead of deferring every free to function return.
+		C.free(unsafe.Pointer(lib))
+		if resp.err != nil {
+			slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
+			C.free(unsafe.Pointer(resp.err))
+			continue
+		}
+		return int(resp.num_devices), &resp.ch, libPath
+	}
+	return 0, nil, ""
+}
+
 func getVerboseState() C.uint16_t {
 	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
 		return C.uint16_t(1)
diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h
index 0f67442f..577bd3f0 100644
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -58,6 +58,7 @@ void cpu_check_ram(mem_info_t *resp);
 #endif
 
 #include "gpu_info_cudart.h"
+#include "gpu_info_nvcuda.h"
 
 #endif // __GPU_INFO_H__
 #endif // __APPLE__
\ No newline at end of file
diff --git a/gpu/gpu_info_cudart.h b/gpu/gpu_info_cudart.h
index ae2579a2..e8a89856 100644
--- a/gpu/gpu_info_cudart.h
+++ b/gpu/gpu_info_cudart.h
@@ -6,9 +6,9 @@
 // Just enough typedef's to dlopen/dlsym for memory information
 typedef enum cudartReturn_enum {
   CUDART_SUCCESS = 0,
-  CUDA_ERROR_INVALID_VALUE = 1,
-  CUDA_ERROR_MEMORY_ALLOCATION = 2,
-  CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
+  CUDART_ERROR_INVALID_VALUE = 1,
+  CUDART_ERROR_MEMORY_ALLOCATION = 2,
+  CUDART_ERROR_INSUFFICIENT_DRIVER = 35,
   // Other values omitted for now...
} cudartReturn_t;
 
diff --git a/gpu/gpu_info_nvcuda.c b/gpu/gpu_info_nvcuda.c
new file mode 100644
index 00000000..e192d2e6
--- /dev/null
+++ b/gpu/gpu_info_nvcuda.c
@@ -0,0 +1,215 @@
+#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
+
+#include <string.h>
+#include "gpu_info_nvcuda.h"
+
+// nvcuda_init loads the NVIDIA driver library at nvcuda_lib_path, resolves the
+// driver entry points named in the lookup table, calls cuInit and stores the
+// device count in resp->num_devices. On failure resp->err holds a strdup'd
+// message (the caller frees it) and any library that was loaded is unloaded.
+void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
+  CUresult ret;
+  resp->err = NULL;
+  resp->num_devices = 0;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+
+      {"cuInit", (void *)&resp->ch.cuInit},
+      {"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
+      {"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
+      {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
+      {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
+      {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
+      {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
+      {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
+      {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
+      {NULL, NULL},
+  };
+
+  resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
+  if (!resp->ch.handle) {
+    char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
+    snprintf(buf, buflen,
+            "Unable to load %s library to query for Nvidia GPUs: %s",
+            nvcuda_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  for (i = 0; l[i].s != NULL; i++) {
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    if (!*l[i].p) {
+      char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+              msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->ch.cuInit)(0);
+  if (ret != CUDA_SUCCESS) {
+    LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
+      resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
+      return;
+    }
+    snprintf(buf, buflen, "nvcuda init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  int version = 0;
+  nvcudaDriverVersion_t driverVersion;
+  driverVersion.major = 0;
+  driverVersion.minor = 0;
+
+  // Report driver version if we're in verbose mode, ignore errors
+  ret = (*resp->ch.cuDriverGetVersion)(&version);
+  if (ret != CUDA_SUCCESS) {
+    LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
+  } else {
+    driverVersion.major = version / 1000;
+    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
+    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
+  }
+
+  ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
+  if (ret != CUDA_SUCCESS) {
+    LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+}
+
+// nvcuda_check_vram looks up device i through the driver handle h and fills
+// resp with its compute capability, an identifier (the device UUID, or the
+// ordinal when the UUID lookup fails) and its total/free memory. A context is
+// created for the memory query and destroyed before returning. On failure
+// resp->err holds a strdup'd message.
+void nvcuda_check_vram(nvcuda_handle_t h, int i, mem_info_t *resp) {
+  resp->err = NULL;
+  nvcudaMemory_t memInfo = {0,0};
+  CUresult ret;
+  CUdevice device = -1;
+  CUcontext ctx = NULL;
+  // Local, as in nvcuda_init, so this file does not export a global "buflen".
+  const int buflen = 256;
+  char buf[buflen + 1];
+  CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+  if (h.handle == NULL) {
+    resp->err = strdup("nvcuda handle isn't initialized");
+    return;
+  }
+
+  ret = (*h.cuDeviceGet)(&device, i);
+  if (ret != CUDA_SUCCESS) {
+    snprintf(buf, buflen, "nvcuda device failed to initialize");
+    resp->err = strdup(buf);
+    return;
+  }
+
+  resp->major = 0;
+  resp->minor = 0;
+  int major = 0;
+  int minor = 0;
+  ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
+  } else {
+    ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
+    if (ret != CUDA_SUCCESS) {
+      LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
+    } else {
+      resp->minor = minor;
+      resp->major = major;
+    }
+  }
+
+  ret = (*h.cuDeviceGetUuid)(&uuid, device);
+  if (ret != CUDA_SUCCESS) {
+    LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
+    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
+  } else {
+    // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
+    snprintf(&resp->gpu_id[0], GPU_ID_LEN,
+        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+        uuid.bytes[0],
+        uuid.bytes[1],
+        uuid.bytes[2],
+        uuid.bytes[3],
+        uuid.bytes[4],
+        uuid.bytes[5],
+        uuid.bytes[6],
+        uuid.bytes[7],
+        uuid.bytes[8],
+        uuid.bytes[9],
+        uuid.bytes[10],
+        uuid.bytes[11],
+        uuid.bytes[12],
+        uuid.bytes[13],
+        uuid.bytes[14],
+        uuid.bytes[15]
+      );
+  }
+
+  // To get memory we have to set (and release) a context
+  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
+  if (ret != CUDA_SUCCESS) {
+    snprintf(buf, buflen, "nvcuda failed to get primary device context %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
+  if (ret != CUDA_SUCCESS) {
+    snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
+    resp->err = strdup(buf);
+    // Best effort on failure...
+    (*h.cuCtxDestroy)(ctx);
+    return;
+  }
+
+  resp->total = memInfo.total;
+  resp->free = memInfo.free;
+
+  // The sizes come from uint64_t fields but %lu takes unsigned long, which is
+  // 32-bit on Windows, so cast the megabyte counts explicitly.
+  LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, (unsigned long)(resp->total / 1024 / 1024));
+  LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, (unsigned long)(resp->free / 1024 / 1024));
+  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
+
+  ret = (*h.cuCtxDestroy)(ctx);
+  if (ret != CUDA_SUCCESS) {
+    LOG(1, "nvcuda failed to release primary device context %d\n", ret);
+  }
+}
+
+// nvcuda_release unloads the driver library referenced by h. h is passed by
+// value, so clearing h.handle here would not reach the caller; the caller's
+// copy must simply not be used after this call.
+void nvcuda_release(nvcuda_handle_t h) {
+  LOG(h.verbose, "releasing nvcuda library\n");
+  UNLOAD_LIBRARY(h.handle);
+  // TODO and other context release logic?
+}
+
+#endif // __APPLE__
\ No newline at end of file
diff --git a/gpu/gpu_info_nvcuda.h b/gpu/gpu_info_nvcuda.h
new file mode 100644
index 00000000..c4d94edd
--- /dev/null
+++ b/gpu/gpu_info_nvcuda.h
@@ -0,0 +1,73 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_NVCUDA_H__
+#define __GPU_INFO_NVCUDA_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum cudaError_enum {
+  CUDA_SUCCESS = 0,
+  CUDA_ERROR_INVALID_VALUE = 1,
+  CUDA_ERROR_MEMORY_ALLOCATION = 2,
+  CUDA_ERROR_NOT_INITIALIZED = 3,
+  CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
+  // Other values omitted for now...
+} CUresult;
+
+typedef enum CUdevice_attribute_enum {
+  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
+  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
+
+  // TODO - not yet wired up but may be useful for Jetson or other
+  // integrated GPU scenarios with shared memory
+  CU_DEVICE_ATTRIBUTE_INTEGRATED = 18
+
+} CUdevice_attribute;
+
+typedef void *nvcudaDevice_t; // Opaque is sufficient
+typedef struct nvcudaMemory_st {
+  uint64_t total;
+  uint64_t free;
+} nvcudaMemory_t;
+
+typedef struct nvcudaDriverVersion {
+  int major;
+  int minor;
+} nvcudaDriverVersion_t;
+
+typedef struct CUuuid_st {
+  unsigned char bytes[16];
+} CUuuid;
+
+typedef int CUdevice;
+typedef void* CUcontext;
+
+// Library handle plus the driver entry points that nvcuda_init resolves by
+// name from the loaded library.
+typedef struct nvcuda_handle {
+  void *handle;
+  uint16_t verbose;
+  CUresult (*cuInit)(unsigned int Flags);
+  CUresult (*cuDriverGetVersion)(int *driverVersion);
+  CUresult (*cuDeviceGetCount)(int *);
+  CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
+  CUresult (*cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev);
+  CUresult (*cuDeviceGetUuid)(CUuuid* uuid, CUdevice dev); // signature compatible with cuDeviceGetUuid_v2
+
+  // Context specific aspects
+  CUresult (*cuCtxCreate_v3)(CUcontext* pctx, void *params, int len, unsigned int flags, CUdevice dev);
+  CUresult (*cuMemGetInfo_v2)(uint64_t* free, uint64_t* total);
+  CUresult (*cuCtxDestroy)(CUcontext ctx);
+} nvcuda_handle_t;
+
+typedef struct nvcuda_init_resp {
+  char *err; // If err is non-null handle is invalid
+  nvcuda_handle_t ch;
+  int num_devices;
+} nvcuda_init_resp_t;
+
+void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
+void nvcuda_check_vram(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
+void nvcuda_release(nvcuda_handle_t ch);
+
+#endif // __GPU_INFO_NVCUDA_H__
+#endif // __APPLE__