From ef757da2c90ad52f35c95688095dfd84655cceb7 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 3 Jul 2024 10:30:07 -0700 Subject: [PATCH] Better nvidia GPU discovery logging Refine the way we log GPU discovery to improve the non-debug output, and report more actionable log messages when possible to help users troubleshoot on their own. --- docs/troubleshooting.md | 14 +++++++++----- gpu/gpu.go | 23 +++++++++++++++++++++-- gpu/gpu_info_nvcuda.c | 31 ++++++++++++++++--------------- gpu/gpu_info_nvcuda.h | 6 +++++- 4 files changed, 51 insertions(+), 23 deletions(-) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index de29b344..bbb77183 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -70,14 +70,18 @@ curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION="0.1.29" sh If your system is configured with the "noexec" flag where Ollama stores its temporary executable files, you can specify an alternate location by setting OLLAMA_TMPDIR to a location writable by the user ollama runs as. For example OLLAMA_TMPDIR=/usr/share/ollama/ -## Container fails to run on NVIDIA GPU +## NVIDIA GPU Discovery -Make sure you've set up the container runtime first as described in [docker.md](./docker.md) +When Ollama starts up, it takes inventory of the GPUs present in the system to determine compatibility and how much VRAM is available. Sometimes this discovery can fail to find your GPUs. In general, running the latest driver will yield the best results. -Sometimes the container runtime can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem +### Linux NVIDIA Troubleshooting -- Is the container runtime working? 
Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama wont be able to see your NVIDIA GPU. -- Is the uvm driver not loaded? `sudo nvidia-modprobe -u` +If you are using a container to run Ollama, make sure you've set up the container runtime first as described in [docker.md](./docker.md) + +Sometimes Ollama can have difficulties initializing the GPU. When you check the server logs, this can show up as various error codes, such as "3" (not initialized), "46" (device unavailable), "100" (no device), "999" (unknown), or others. The following troubleshooting techniques may help resolve the problem + +- If you are using a container, is the container runtime working? Try `docker run --gpus all ubuntu nvidia-smi` - if this doesn't work, Ollama won't be able to see your NVIDIA GPU. +- Is the uvm driver loaded? `sudo nvidia-modprobe -u` - Try reloading the nvidia_uvm driver - `sudo rmmod nvidia_uvm` then `sudo modprobe nvidia_uvm` - Try rebooting - Make sure you're running the latest nvidia drivers diff --git a/gpu/gpu.go b/gpu/gpu.go index 583bb79c..29a3c103 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -202,7 +202,7 @@ func GetGPUInfo() GpuInfoList { }() if !bootstrapped { - slog.Debug("Detecting GPUs") + slog.Info("looking for compatible GPUs") needRefresh = false cpuCapability = GetCPUCapability() var memInfo C.mem_info_t @@ -320,6 +320,9 @@ func GetGPUInfo() GpuInfoList { rocmGPUs = AMDGetGPUInfo() bootstrapped = true + if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 { + slog.Info("no compatible GPUs were discovered") + } } // For detected GPUs, load library if not loaded @@ -514,7 +517,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) { defer C.free(unsafe.Pointer(lib)) C.nvcuda_init(lib, &resp) if resp.err != nil { - slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err)) + // Decide what log level based on the type of error message to help users understand 
why + msg := C.GoString(resp.err) + switch resp.cudaErr { + case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH: + slog.Warn("version mismatch between driver and cuda driver library - reboot or upgrade may be required", "library", libPath, "error", msg) + case C.CUDA_ERROR_NO_DEVICE: + slog.Info("no nvidia devices detected", "library", libPath) + case C.CUDA_ERROR_UNKNOWN: + slog.Warn("unknown error initializing cuda driver library", "library", libPath, "error", msg) + slog.Warn("see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information") + default: + if strings.Contains(msg, "wrong ELF class") { + slog.Debug("skipping 32bit library", "library", libPath) + } else { + slog.Info("unable to load cuda driver library", "library", libPath, "error", msg) + } + } C.free(unsafe.Pointer(resp.err)) } else { return int(resp.num_devices), &resp.ch, libPath diff --git a/gpu/gpu_info_nvcuda.c b/gpu/gpu_info_nvcuda.c index abe14084..a1a38bfc 100644 --- a/gpu/gpu_info_nvcuda.c +++ b/gpu/gpu_info_nvcuda.c @@ -7,6 +7,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { CUresult ret; resp->err = NULL; resp->num_devices = 0; + resp->cudaErr = CUDA_SUCCESS; const int buflen = 256; char buf[buflen + 1]; int i; @@ -38,6 +39,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { nvcuda_lib_path, msg); free(msg); resp->err = strdup(buf); + resp->cudaErr = -1; return; } @@ -52,6 +54,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { msg); free(msg); resp->err = strdup(buf); + resp->cudaErr = -1; return; } } @@ -61,12 +64,9 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { LOG(resp->ch.verbose, "cuInit err: %d\n", ret); UNLOAD_LIBRARY(resp->ch.handle); resp->ch.handle = NULL; - if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) { - resp->err = strdup("your nvidia driver is too old or missing. 
If you have a CUDA GPU please upgrade to run ollama"); - return; - } - snprintf(buf, buflen, "nvcuda init failure: %d", ret); + snprintf(buf, buflen, "cuda driver library init failure: %d", ret); resp->err = strdup(buf); + resp->cudaErr = ret; return; } @@ -91,6 +91,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) { resp->ch.handle = NULL; snprintf(buf, buflen, "unable to get device count: %d", ret); resp->err = strdup(buf); + resp->cudaErr = ret; return; } } @@ -106,13 +107,13 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) { CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; if (h.handle == NULL) { - resp->err = strdup("nvcuda handle isn't initialized"); + resp->err = strdup("cuda driver library handle isn't initialized"); return; } ret = (*h.cuDeviceGet)(&device, i); if (ret != CUDA_SUCCESS) { - snprintf(buf, buflen, "nvcuda device failed to initialize"); + snprintf(buf, buflen, "cuda driver library device failed to initialize"); resp->err = strdup(buf); return; } @@ -168,14 +169,14 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) { // To get memory we have to set (and release) a context ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device); if (ret != CUDA_SUCCESS) { - snprintf(buf, buflen, "nvcuda failed to get device context %d", ret); + snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret); resp->err = strdup(buf); return; } ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total); if (ret != CUDA_SUCCESS) { - snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret); + snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret); resp->err = strdup(buf); // Best effort on failure... 
(*h.cuCtxDestroy)(ctx); @@ -193,7 +194,7 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) { ret = (*h.cuCtxDestroy)(ctx); if (ret != CUDA_SUCCESS) { - LOG(1, "nvcuda failed to release device context %d", ret); + LOG(1, "cuda driver library failed to release device context %d", ret); } } @@ -206,7 +207,7 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) ret = (*h.cuDeviceGet)(&device, i); if (ret != CUDA_SUCCESS) { - LOG(1, "nvcuda device failed to initialize"); + LOG(1, "cuda driver library device failed to initialize"); return; } @@ -214,13 +215,13 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) // To get memory we have to set (and release) a context ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device); if (ret != CUDA_SUCCESS) { - LOG(1, "nvcuda failed to get device context %d", ret); + LOG(1, "cuda driver library failed to get device context %d", ret); return; } ret = (*h.cuMemGetInfo_v2)(free, total); if (ret != CUDA_SUCCESS) { - LOG(1, "nvcuda device memory info lookup failure %d", ret); + LOG(1, "cuda driver library device memory info lookup failure %d", ret); // Best effort on failure... (*h.cuCtxDestroy)(ctx); return; @@ -228,12 +229,12 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) ret = (*h.cuCtxDestroy)(ctx); if (ret != CUDA_SUCCESS) { - LOG(1, "nvcuda failed to release device context %d", ret); + LOG(1, "cuda driver library failed to release device context %d", ret); } } void nvcuda_release(nvcuda_handle_t h) { - LOG(h.verbose, "releasing nvcuda library\n"); + LOG(h.verbose, "releasing cuda driver library\n"); UNLOAD_LIBRARY(h.handle); // TODO and other context release logic? 
h.handle = NULL; diff --git a/gpu/gpu_info_nvcuda.h b/gpu/gpu_info_nvcuda.h index f9654f64..ef2fe8a3 100644 --- a/gpu/gpu_info_nvcuda.h +++ b/gpu/gpu_info_nvcuda.h @@ -7,9 +7,12 @@ typedef enum cudaError_enum { CUDA_SUCCESS = 0, CUDA_ERROR_INVALID_VALUE = 1, - CUDA_ERROR_MEMORY_ALLOCATION = 2, + CUDA_ERROR_OUT_OF_MEMORY = 2, CUDA_ERROR_NOT_INITIALIZED = 3, CUDA_ERROR_INSUFFICIENT_DRIVER = 35, + CUDA_ERROR_NO_DEVICE = 100, + CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, + CUDA_ERROR_UNKNOWN = 999, // Other values omitted for now... } CUresult; @@ -64,6 +67,7 @@ typedef struct nvcuda_init_resp { char *err; // If err is non-null handle is invalid nvcuda_handle_t ch; int num_devices; + CUresult cudaErr; } nvcuda_init_resp_t; void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);