From 6f351bf586642e0c1c7086af028cdff0e856a254 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 5 Jun 2024 12:07:20 -0700 Subject: [PATCH] review comments and coverage --- gpu/amd_linux.go | 34 ++----- gpu/amd_windows.go | 3 +- gpu/cpu_common.go | 8 +- gpu/gpu.go | 191 +++++------------------------------- gpu/gpu_darwin.go | 4 +- gpu/gpu_info_cpu.c | 41 -------- gpu/gpu_info_oneapi.c | 100 ++++++++----------- gpu/gpu_info_oneapi.h | 67 +++++-------- gpu/gpu_linux.go | 89 +++++++++++++++++ gpu/gpu_windows.go | 55 +++++++++++ gpu/types.go | 33 +++---- integration/context_test.go | 3 +- llm/memory.go | 67 +++++-------- llm/memory_test.go | 67 +++++++------ llm/payload.go | 16 +-- llm/server.go | 4 +- server/sched.go | 9 +- server/sched_test.go | 40 +++++++- 18 files changed, 375 insertions(+), 456 deletions(-) delete mode 100644 gpu/gpu_info_cpu.c create mode 100644 gpu/gpu_linux.go create mode 100644 gpu/gpu_windows.go diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index 97c8274f..61e6a059 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -178,7 +178,7 @@ func AMDGetGPUInfo() []RocmGPUInfo { // Shouldn't happen, but just in case... if gpuID < 0 { slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue") - return []RocmGPUInfo{} + return nil } if int(major) < RocmComputeMin { @@ -205,22 +205,17 @@ func AMDGetGPUInfo() []RocmGPUInfo { matched := true for _, m := range mapping { if m.id == 0 { + // Null ID means it didn't populate, so we can't use it to match continue } filename := filepath.Join(devDir, m.filename) - fp, err := os.Open(filename) - if err != nil { - slog.Debug("failed to open sysfs node", "file", filename, "error", err) - matched = false - break - } - defer fp.Close() - buf, err := io.ReadAll(fp) + buf, err := os.ReadFile(filename) if err != nil { slog.Debug("failed to read sysfs node", "file", filename, "error", err) matched = false break } + // values here are in hex, strip off the leading 0x and parse so we can compare the numeric (decimal) values in amdgpu cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64) if err != nil { slog.Debug("failed to parse sysfs node", "file", filename, "error", err) @@ -239,13 +234,7 @@ func AMDGetGPUInfo() []RocmGPUInfo { // Found the matching DRM directory slog.Debug("matched", "amdgpu", match, "drm", devDir) totalFile := filepath.Join(devDir, DRMTotalMemoryFile) - totalFp, err := os.Open(totalFile) - if err != nil { - slog.Debug("failed to open sysfs node", "file", totalFile, "error", err) - break - } - defer totalFp.Close() - buf, err := io.ReadAll(totalFp) + buf, err := os.ReadFile(totalFile) if err != nil { slog.Debug("failed to read sysfs node", "file", totalFile, "error", err) break } @@ -284,7 +273,7 @@ func AMDGetGPUInfo() []RocmGPUInfo { TotalMemory: totalMemory, FreeMemory: (totalMemory - usedMemory), }, - ID: fmt.Sprintf("%d", gpuID), + ID: strconv.Itoa(gpuID), Name: name, Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch), MinimumMemory: rocmMinimumMemory, @@ -315,7 +304,7 @@ func AMDGetGPUInfo() []RocmGPUInfo { libDir, err = AMDValidateLibDir() if err != nil { slog.Warn("unable to verify rocm library, will use cpu", "error", err) - return []RocmGPUInfo{} + return nil } } gpuInfo.DependencyPath = libDir @@ -326,7 +315,7 @@ func AMDGetGPUInfo() []RocmGPUInfo { supported, err = GetSupportedGFX(libDir) if err != nil { slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err) - return
[]RocmGPUInfo{} + return nil } slog.Debug("rocm supported GPUs", "types", supported) } @@ -434,12 +423,7 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error { } func getFreeMemory(usedFile string) (uint64, error) { - usedFp, err := os.Open(usedFile) - if err != nil { - return 0, fmt.Errorf("failed to open sysfs node %s %w", usedFile, err) - } - defer usedFp.Close() - buf, err := io.ReadAll(usedFp) + buf, err := os.ReadFile(usedFile) if err != nil { return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err) } diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index 9bba40f8..cad45f6c 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -7,6 +7,7 @@ import ( "os" "path/filepath" "slices" + "strconv" "strings" "github.com/ollama/ollama/format" @@ -124,7 +125,7 @@ func AMDGetGPUInfo() []RocmGPUInfo { TotalMemory: totalMemory, FreeMemory: freeMemory, }, - ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices + ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices DependencyPath: libDir, MinimumMemory: rocmMinimumMemory, Name: name, diff --git a/gpu/cpu_common.go b/gpu/cpu_common.go index 0e0c55ea..63e88f25 100644 --- a/gpu/cpu_common.go +++ b/gpu/cpu_common.go @@ -4,11 +4,7 @@ import ( "golang.org/x/sys/cpu" ) -func GetCPUVariant() string { - return getCPUCapability().ToVariant() -} - -func getCPUCapability() CPUCapability { +func GetCPUCapability() CPUCapability { if cpu.X86.HasAVX2 { return CPUCapabilityAVX2 } @@ -16,5 +12,5 @@ func getCPUCapability() CPUCapability { return CPUCapabilityAVX } // else LCD - return CPUCapabilityBase + return CPUCapabilityNone } diff --git a/gpu/gpu.go b/gpu/gpu.go index 9a01c363..56a4dbfa 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -11,8 +11,6 @@ package gpu */ import "C" import ( - "bufio" - "bytes" "fmt" "log/slog" "os" @@ -66,54 +64,6 @@ var RocmComputeMin = 9 // TODO find a better way to detect iGPU instead of minimum memory const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU -var CudartLinuxGlobs = []string{ - "/usr/local/cuda/lib64/libcudart.so*", - "/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*", - "/usr/lib/x86_64-linux-gnu/libcudart.so*", - "/usr/lib/wsl/lib/libcudart.so*", - "/usr/lib/wsl/drivers/*/libcudart.so*", - "/opt/cuda/lib64/libcudart.so*", - "/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*", - "/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*", - "/usr/lib/aarch64-linux-gnu/libcudart.so*", - "/usr/local/cuda/lib*/libcudart.so*", - "/usr/lib*/libcudart.so*", - "/usr/local/lib*/libcudart.so*", -} - -var CudartWindowsGlobs = []string{ - "c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll", -} - -var NvmlWindowsGlobs = []string{ - "c:\\Windows\\System32\\nvml.dll", -} - -var NvcudaLinuxGlobs = []string{ - "/usr/local/cuda*/targets/*/lib/libcuda.so*", - "/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*", - "/usr/lib/*-linux-gnu/libcuda.so*", - "/usr/lib/wsl/lib/libcuda.so*", - "/usr/lib/wsl/drivers/*/libcuda.so*", - "/opt/cuda/lib*/libcuda.so*", - "/usr/local/cuda/lib*/libcuda.so*", - "/usr/lib*/libcuda.so*", - "/usr/local/lib*/libcuda.so*", -} - -var NvcudaWindowsGlobs = []string{ - "c:\\windows\\system*\\nvcuda.dll", -} - -var OneapiWindowsGlobs = []string{ - "c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll", -} - -var OneapiLinuxGlobs = []string{ - "/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*", - 
"/usr/lib*/libze_intel_gpu.so*", -} - // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. var CudaTegra string = os.Getenv("JETSON_JETPACK") @@ -139,47 +89,24 @@ func initCudaHandles() *cudaHandles { } slog.Debug("searching for GPU discovery libraries for NVIDIA") - var cudartMgmtName string var cudartMgmtPatterns []string - var nvcudaMgmtName string - var nvcudaMgmtPatterns []string - var nvmlMgmtName string - var nvmlMgmtPatterns []string - tmpDir, _ := PayloadsDir() - switch runtime.GOOS { - case "windows": - cudartMgmtName = "cudart64_*.dll" + // Aligned with driver, we can't carry as payloads + nvcudaMgmtPatterns := NvcudaGlobs + + if runtime.GOOS == "windows" { localAppData := os.Getenv("LOCALAPPDATA") - cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)} - cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...) - // Aligned with driver, we can't carry as payloads - nvcudaMgmtName = "nvcuda.dll" - nvcudaMgmtPatterns = NvcudaWindowsGlobs - - // Use nvml to refresh free memory on windows only - nvmlMgmtName = "nvml.dll" - nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs)) - copy(nvmlMgmtPatterns, NvmlWindowsGlobs) - - case "linux": - cudartMgmtName = "libcudart.so*" - if tmpDir != "" { - // TODO - add "payloads" for subprocess - cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)} - } - cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...) - // Aligned with driver, we can't carry as payloads - nvcudaMgmtName = "libcuda.so*" - nvcudaMgmtPatterns = NvcudaLinuxGlobs - - // nvml omitted on linux - default: - return cHandles + cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)} } + tmpDir, _ := PayloadsDir() + if tmpDir != "" { + // TODO - add "payloads" for subprocess + cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)} + } + cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...) 
- if len(nvmlMgmtPatterns) > 0 { - nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns) + if len(NvmlGlobs) > 0 { + nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs) if len(nvmlLibPaths) > 0 { nvml, libPath := LoadNVMLMgmt(nvmlLibPaths) if nvml != nil { @@ -190,7 +117,7 @@ func initCudaHandles() *cudaHandles { } } - nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns) + nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns) if len(nvcudaLibPaths) > 0 { deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths) if nvcuda != nil { @@ -202,7 +129,7 @@ func initCudaHandles() *cudaHandles { } } - cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns) + cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns) if len(cudartLibPaths) > 0 { deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths) if cudart != nil { @@ -220,8 +147,6 @@ func initCudaHandles() *cudaHandles { // Note: gpuMutex must already be held func initOneAPIHandles() *oneapiHandles { oHandles := &oneapiHandles{} - var oneapiMgmtName string - var oneapiMgmtPatterns []string // Short Circuit if we already know which library to use if oneapiLibPath != "" { @@ -229,18 +154,7 @@ func initOneAPIHandles() *oneapiHandles { return oHandles } - switch runtime.GOOS { - case "windows": - oneapiMgmtName = "ze_intel_gpu64.dll" - oneapiMgmtPatterns = OneapiWindowsGlobs - case "linux": - oneapiMgmtName = "libze_intel_gpu.so" - oneapiMgmtPatterns = OneapiLinuxGlobs - default: - return oHandles - } - - oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns) + oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs) if len(oneapiLibPaths) > 0 { oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths) } @@ -290,7 +204,7 @@ func GetGPUInfo() GpuInfoList { if !bootstrapped { slog.Debug("Detecting GPUs") needRefresh = false - cpuCapability = getCPUCapability() + cpuCapability = GetCPUCapability() var memInfo C.mem_info_t mem, err := GetCPUMem() @@ -301,14 +215,14 @@ func GetGPUInfo() GpuInfoList { GpuInfo: GpuInfo{ memInfo: mem, Library: "cpu", - Variant: cpuCapability.ToVariant(), + Variant: cpuCapability, ID: "0", }, }} // Fallback to CPU mode if we're lacking required vector extensions on x86 if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" { - slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability.ToString(), "detected", cpuCapability.ToString()) + slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability) bootstrapped = true // No need to do any GPU discovery, since we can't run on them return GpuInfoList{cpus[0].GpuInfo} @@ -357,8 +271,8 @@ func GetGPUInfo() GpuInfoList { gpuInfo.MinimumMemory = cudaMinimumMemory gpuInfo.DependencyPath = depPath gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - gpuInfo.DriverMajor = int(driverMajor) - gpuInfo.DriverMinor = int(driverMinor) + gpuInfo.DriverMajor = driverMajor + gpuInfo.DriverMinor = driverMinor // TODO potentially sort on our own algorithm instead of what the underlying GPU library does... 
cudaGPUs = append(cudaGPUs, gpuInfo) @@ -374,16 +288,16 @@ func GetGPUInfo() GpuInfoList { continue } devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d)) - for i := 0; i < int(devCount); i++ { + for i := range devCount { gpuInfo := OneapiGPUInfo{ GpuInfo: GpuInfo{ Library: "oneapi", }, driverIndex: d, - gpuIndex: i, + gpuIndex: int(i), } // TODO - split bootstrapping from updating free memory - C.oneapi_check_vram(*oHandles.oneapi, C.int(d), C.int(i), &memInfo) + C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo) // TODO - convert this to MinimumMemory based on testing... var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. memInfo.free = C.uint64_t(totalFreeMem) @@ -505,22 +419,6 @@ func GetGPUInfo() GpuInfoList { return resp } -func GetCPUMem() (memInfo, error) { - if runtime.GOOS == "linux" { - return GetLinuxMemInfo() - } - var ret memInfo - var info C.mem_info_t - C.cpu_check_ram(&info) - if info.err != nil { - defer C.free(unsafe.Pointer(info.err)) - return ret, fmt.Errorf(C.GoString(info.err)) - } - ret.FreeMemory = uint64(info.free) - ret.TotalMemory = uint64(info.total) - return ret, nil -} - func FindGPULibs(baseLibName string, defaultPatterns []string) []string { // Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them var ldPaths []string @@ -646,7 +544,7 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) { slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err)) C.free(unsafe.Pointer(resp.err)) } else { - for i := 0; i < int(resp.oh.num_drivers); i++ { + for i := range resp.oh.num_drivers { num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i))) } return num_devices, &resp.oh, libPath @@ -682,42 +580,3 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { return "", "" } } - -func GetLinuxMemInfo() (memInfo, error) { - var mem memInfo - var total, available, free, buffers, cached uint64 - f, err := os.Open("/proc/meminfo") - if err != nil { - return mem, err - } - defer f.Close() - s := bufio.NewScanner(f) - for s.Scan() { - switch { - case bytes.HasPrefix(s.Bytes(), []byte(`MemTotal:`)): - _, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total) - case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)): - _, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available) - case bytes.HasPrefix(s.Bytes(), []byte(`MemFree:`)): - _, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free) - case bytes.HasPrefix(s.Bytes(), []byte(`Buffers:`)): - _, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers) - case bytes.HasPrefix(s.Bytes(), []byte(`Cached:`)): - _, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached) - default: - continue - } - if err != nil { - return mem, err - } - - if total > 0 && available > 0 { - mem.TotalMemory = total * 1024 - mem.FreeMemory = available * 1024 - return mem, nil - } - } - mem.TotalMemory = total * 1024 - mem.FreeMemory = (free + buffers + cached) * 1024 - return mem, nil -} diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index d21be410..f26d23c1 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -24,7 +24,7 @@ func GetGPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUVariant(), + Variant: GetCPUCapability(), memInfo: mem, }, } @@ -47,7 +47,7 @@ func GetCPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUVariant(), + Variant: GetCPUCapability(), memInfo: mem, }, } 
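The hunks above replace the stringly-typed GetCPUVariant()/ToVariant() pair with GetCPUCapability(), which returns the CPUCapability enum defined in gpu/types.go further below; its String() form doubles as the runner-variant suffix. A minimal sketch of how a caller is expected to consume the new API; the main function and messages here are invented for illustration and are not part of the patch:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/gpu"
)

func main() {
	// Best vector extension supported by the host CPU:
	// CPUCapabilityNone, CPUCapabilityAVX, or CPUCapabilityAVX2.
	capability := gpu.GetCPUCapability()

	// String() doubles as the runner-variant suffix, e.g. "cpu_avx2".
	if capability != gpu.CPUCapabilityNone {
		fmt.Println("preferred CPU runner:", "cpu_"+capability.String())
	}

	// GPU discovery is skipped when the minimum capability (AVX) is
	// missing on amd64, mirroring the check in GetGPUInfo above.
	if capability < gpu.GPURunnerCPUCapability {
		fmt.Println("GPU inference disabled; required", gpu.GPURunnerCPUCapability, "detected", capability)
	}
}
```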
diff --git a/gpu/gpu_info_cpu.c b/gpu/gpu_info_cpu.c deleted file mode 100644 index f7d849d7..00000000 --- a/gpu/gpu_info_cpu.c +++ /dev/null @@ -1,41 +0,0 @@ -#include "gpu_info.h" -// Fallbacks for CPU mode - -#ifdef _WIN32 -#include <sysinfoapi.h> -void cpu_check_ram(mem_info_t *resp) { - resp->err = NULL; - MEMORYSTATUSEX info; - info.dwLength = sizeof(info); - if (GlobalMemoryStatusEx(&info) != 0) { - resp->total = info.ullTotalPhys; - resp->free = info.ullAvailPhys; - snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0"); - } else { - resp->err = LOAD_ERR(); - } - return; -} - -#elif __linux__ -#include <errno.h> -#include <string.h> -#include <sys/sysinfo.h> -void cpu_check_ram(mem_info_t *resp) { - struct sysinfo info; - resp->err = NULL; - if (sysinfo(&info) != 0) { - resp->err = strdup(strerror(errno)); - } else { - resp->total = info.totalram * info.mem_unit; - resp->free = info.freeram * info.mem_unit; - snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0"); - } - return; -} - -#elif __APPLE__ -// Unused - see gpu_darwin.go -#else -#error "Unsupported platform" -#endif diff --git a/gpu/gpu_info_oneapi.c b/gpu/gpu_info_oneapi.c index cc58f7a2..e90c694a 100644 --- a/gpu/gpu_info_oneapi.c +++ b/gpu/gpu_info_oneapi.c @@ -4,8 +4,7 @@ #include <string.h> -void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) -{ +void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) { ze_result_t ret; resp->err = NULL; resp->oh.devices = NULL; @@ -15,8 +14,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) const int buflen = 256; char buf[buflen + 1]; int i, d, count; - struct lookup - { + struct lookup { char *s; void **p; } l[] = { @@ -32,8 +30,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) }; resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY); - if (!resp->oh.handle) - { + if (!resp->oh.handle) { char *msg = LOAD_ERR(); snprintf(buf, buflen, "Unable to load %s library to query for Intel GPUs: %s\n", @@ -48,14 +45,12 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) "wiring Level-Zero management library functions in %s\n", oneapi_lib_path); - for (i = 0; l[i].s != NULL; i++) - { + for (i = 0; l[i].s != NULL; i++) { // TODO once we've squashed the remaining corner cases remove this log LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s); *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s); - if (!l[i].p) - { + if (!l[i].p) { resp->oh.handle = NULL; char *msg = LOAD_ERR(); LOG(resp->oh.verbose, "dlerr: %s\n", msg); @@ -68,8 +63,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) } ret = (*resp->oh.zesInit)(0); - if (ret != ZE_RESULT_SUCCESS) - { + if (ret != ZE_RESULT_SUCCESS) { LOG(resp->oh.verbose, "zesInit err: %x\n", ret); snprintf(buf, buflen, "oneapi vram init failure: %x", ret); resp->err = strdup(buf); @@ -79,8 +73,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) count = 0; ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL); - if (ret != ZE_RESULT_SUCCESS) - { + if (ret != ZE_RESULT_SUCCESS) { LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret); snprintf(buf, buflen, "unable to get driver count: %x", ret); resp->err = strdup(buf); @@ -91,10 +84,10 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t)); resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t)); memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t)); - resp->oh.devices = malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t*)); + resp->oh.devices = +
malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *)); ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]); - if (ret != ZE_RESULT_SUCCESS) - { + if (ret != ZE_RESULT_SUCCESS) { LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret); snprintf(buf, buflen, "unable to get driver count: %x", ret); resp->err = strdup(buf); @@ -103,19 +96,20 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) for (d = 0; d < resp->oh.num_drivers; d++) { - ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], NULL); - if (ret != ZE_RESULT_SUCCESS) - { + ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], + &resp->oh.num_devices[d], NULL); + if (ret != ZE_RESULT_SUCCESS) { LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret); snprintf(buf, buflen, "unable to get device count: %x", ret); resp->err = strdup(buf); oneapi_release(resp->oh); return; } - resp->oh.devices[d] = malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t)); - ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]); - if (ret != ZE_RESULT_SUCCESS) - { + resp->oh.devices[d] = + malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t)); + ret = (*resp->oh.zesDeviceGet)( + resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]); + if (ret != ZE_RESULT_SUCCESS) { LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret); snprintf(buf, buflen, "unable to get device count: %x", ret); resp->err = strdup(buf); @@ -128,8 +122,8 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) return; } -void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *resp) -{ +void oneapi_check_vram(oneapi_handle_t h, int driver, int device, + mem_info_t *resp) { ze_result_t ret; resp->err = NULL; uint64_t totalMem = 0; @@ -138,12 +132,11 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re char buf[buflen + 1]; int i, d, m; - if (h.handle == NULL) - { + if (h.handle == NULL) { resp->err = strdup("Level-Zero handle not initialized"); return; } - + if (driver > h.num_drivers || device > h.num_devices[driver]) { resp->err = strdup("driver or device index out of bounds"); return; } @@ -161,8 +154,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re props.pNext = &ext_props; ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props); - if (ret != ZE_RESULT_SUCCESS) - { + if (ret != ZE_RESULT_SUCCESS) { snprintf(buf, buflen, "unable to get device properties: %d", ret); resp->err = strdup(buf); return; } @@ -175,8 +167,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re // TODO - the driver isn't included - what if there are multiple drivers? snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device); - if (h.verbose) - { + if (h.verbose) { // When in verbose mode, report more information about // the card we discover.
LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device, @@ -195,11 +186,11 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re // Compute Capability equivalent in resp->major, resp->minor, resp->patch uint32_t memCount = 0; - ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, NULL); - if (ret != ZE_RESULT_SUCCESS) - { - snprintf(buf, buflen, - "unable to enumerate Level-Zero memory modules: %x", ret); + ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, + NULL); + if (ret != ZE_RESULT_SUCCESS) { + snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x", + ret); resp->err = strdup(buf); return; } @@ -209,14 +200,12 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t)); (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems); - for (m = 0; m < memCount; m++) - { + for (m = 0; m < memCount; m++) { zes_mem_state_t state; state.stype = ZES_STRUCTURE_TYPE_MEM_STATE; state.pNext = NULL; ret = (*h.zesMemoryGetState)(mems[m], &state); - if (ret != ZE_RESULT_SUCCESS) - { + if (ret != ZE_RESULT_SUCCESS) { snprintf(buf, buflen, "unable to get memory state: %x", ret); resp->err = strdup(buf); free(mems); @@ -230,29 +219,23 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re free(mems); } -void oneapi_release(oneapi_handle_t h) -{ +void oneapi_release(oneapi_handle_t h) { int d; LOG(h.verbose, "releasing oneapi library\n"); - for (d = 0; d < h.num_drivers; d++) - { - if (h.devices != NULL && h.devices[d] != NULL) - { + for (d = 0; d < h.num_drivers; d++) { + if (h.devices != NULL && h.devices[d] != NULL) { free(h.devices[d]); } } - if (h.devices != NULL) - { + if (h.devices != NULL) { free(h.devices); h.devices = NULL; } - if (h.num_devices != NULL) - { + if (h.num_devices != NULL) { free(h.num_devices); h.num_devices = NULL; } - if (h.drivers != NULL) - { + if (h.drivers != NULL) { free(h.drivers); h.drivers = NULL; } @@ -261,14 +244,11 @@ void oneapi_release(oneapi_handle_t h) h.handle = NULL; } -int oneapi_get_device_count(oneapi_handle_t h, int driver) -{ - if (h.handle == NULL || h.num_devices == NULL) - { +int oneapi_get_device_count(oneapi_handle_t h, int driver) { + if (h.handle == NULL || h.num_devices == NULL) { return 0; } - if (driver > h.num_drivers) - { + if (driver > h.num_drivers) { return 0; } return (int)h.num_devices[driver]; diff --git a/gpu/gpu_info_oneapi.h b/gpu/gpu_info_oneapi.h index 7607935c..97fcecd9 100644 --- a/gpu/gpu_info_oneapi.h +++ b/gpu/gpu_info_oneapi.h @@ -9,8 +9,7 @@ #define ZE_BIT(_i) (1 << _i) // Just enough typedef's to dlopen/dlsym for memory information -typedef enum ze_result_t -{ +typedef enum ze_result_t { ZE_RESULT_SUCCESS = 0, // Other values omitted for now... 
} ze_result_t; @@ -20,13 +19,11 @@ typedef struct _zes_driver_handle_t *zes_driver_handle_t; typedef struct _zes_device_handle_t *zes_device_handle_t; typedef struct _zes_mem_handle_t *zes_mem_handle_t; -typedef enum _ze_structure_type_t -{ +typedef enum _ze_structure_type_t { ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff } ze_structure_type_t; -typedef enum _zes_structure_type_t -{ +typedef enum _zes_structure_type_t { ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1, ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb, ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e, @@ -34,35 +31,29 @@ typedef enum _zes_structure_type_t ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff } zes_structure_type_t; -typedef enum _zes_mem_type_t -{ +typedef enum _zes_mem_type_t { ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff } zes_mem_type_t; -typedef enum _zes_mem_loc_t -{ +typedef enum _zes_mem_loc_t { ZES_MEM_LOC_SYSTEM = 0, ZES_MEM_LOC_DEVICE = 1, ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff } zes_mem_loc_t; -typedef enum _zes_mem_health_t -{ +typedef enum _zes_mem_health_t { ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff } zes_mem_health_t; -typedef struct _ze_device_uuid_t -{ +typedef struct _ze_device_uuid_t { uint8_t id[ZE_MAX_DEVICE_UUID_SIZE]; } ze_device_uuid_t; -typedef struct _zes_uuid_t -{ +typedef struct _zes_uuid_t { uint8_t id[ZE_MAX_DEVICE_UUID_SIZE]; } zes_uuid_t; -typedef enum _ze_device_type_t -{ +typedef enum _ze_device_type_t { ZE_DEVICE_TYPE_GPU = 1, ZE_DEVICE_TYPE_CPU = 2, ZE_DEVICE_TYPE_FPGA = 3, @@ -71,8 +62,7 @@ typedef enum _ze_device_type_t ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff } ze_device_type_t; -typedef enum _zes_device_type_t -{ +typedef enum _zes_device_type_t { ZES_DEVICE_TYPE_GPU = 1, ZES_DEVICE_TYPE_CPU = 2, ZES_DEVICE_TYPE_FPGA = 3, @@ -82,8 +72,7 @@ typedef enum _zes_device_type_t } zes_device_type_t; typedef uint32_t ze_device_property_flags_t; -typedef enum _ze_device_property_flag_t -{ +typedef enum _ze_device_property_flag_t { ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0), ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1), ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2), @@ -92,8 +81,7 @@ typedef enum _ze_device_property_flag_t } ze_device_property_flag_t; typedef uint32_t zes_device_property_flags_t; -typedef enum _zes_device_property_flag_t -{ +typedef enum _zes_device_property_flag_t { ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0), ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1), ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2), @@ -101,8 +89,7 @@ typedef enum _zes_device_property_flag_t ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff } zes_device_property_flag_t; -typedef struct _ze_device_properties_t -{ +typedef struct _ze_device_properties_t { ze_structure_type_t stype; void *pNext; ze_device_type_t type; @@ -126,8 +113,7 @@ typedef struct _ze_device_properties_t char name[ZE_MAX_DEVICE_NAME]; } ze_device_properties_t; -typedef struct _zes_device_properties_t -{ +typedef struct _zes_device_properties_t { zes_structure_type_t stype; void *pNext; ze_device_properties_t core; @@ -140,8 +126,7 @@ typedef struct _zes_device_properties_t char driverVersion[ZES_STRING_PROPERTY_SIZE]; } zes_device_properties_t; -typedef struct _zes_device_ext_properties_t -{ +typedef struct _zes_device_ext_properties_t { zes_structure_type_t stype; void *pNext; zes_uuid_t uuid; @@ -149,8 +134,7 @@ typedef struct _zes_device_ext_properties_t zes_device_property_flags_t flags; } zes_device_ext_properties_t; -typedef struct _zes_mem_properties_t -{ +typedef struct _zes_mem_properties_t { zes_structure_type_t stype; void *pNext; zes_mem_type_t type; @@ 
-162,8 +146,7 @@ typedef struct _zes_mem_properties_t int32_t numChannels; } zes_mem_properties_t; -typedef struct _zes_mem_state_t -{ +typedef struct _zes_mem_state_t { zes_structure_type_t stype; const void *pNext; zes_mem_health_t health; @@ -171,15 +154,14 @@ typedef struct _zes_mem_state_t uint64_t size; } zes_mem_state_t; -typedef struct oneapi_handle -{ +typedef struct oneapi_handle { void *handle; uint16_t verbose; uint32_t num_drivers; - zes_driver_handle_t *drivers; + zes_driver_handle_t *drivers; uint32_t *num_devices; - zes_device_handle_t **devices; + zes_device_handle_t **devices; // TODO Driver major, minor information // int driver_major; @@ -201,20 +183,19 @@ typedef struct oneapi_handle } oneapi_handle_t; -typedef struct oneapi_init_resp -{ +typedef struct oneapi_init_resp { char *err; // If err is non-null handle is invalid oneapi_handle_t oh; } oneapi_init_resp_t; -typedef struct oneapi_version_resp -{ +typedef struct oneapi_version_resp { ze_result_t status; char *str; // Contains version or error string if status != 0 } oneapi_version_resp_t; void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp); -void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *resp); +void oneapi_check_vram(oneapi_handle_t h, int driver, int device, + mem_info_t *resp); void oneapi_release(oneapi_handle_t h); int oneapi_get_device_count(oneapi_handle_t h, int driver); diff --git a/gpu/gpu_linux.go b/gpu/gpu_linux.go new file mode 100644 index 00000000..a099bf82 --- /dev/null +++ b/gpu/gpu_linux.go @@ -0,0 +1,89 @@ +package gpu + +import ( + "bufio" + "fmt" + "os" + "strings" + + "github.com/ollama/ollama/format" +) + +var CudartGlobs = []string{ + "/usr/local/cuda/lib64/libcudart.so*", + "/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*", + "/usr/lib/x86_64-linux-gnu/libcudart.so*", + "/usr/lib/wsl/lib/libcudart.so*", + "/usr/lib/wsl/drivers/*/libcudart.so*", + "/opt/cuda/lib64/libcudart.so*", + "/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*", + "/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*", + "/usr/lib/aarch64-linux-gnu/libcudart.so*", + "/usr/local/cuda/lib*/libcudart.so*", + "/usr/lib*/libcudart.so*", + "/usr/local/lib*/libcudart.so*", +} + +var NvmlGlobs = []string{} + +var NvcudaGlobs = []string{ + "/usr/local/cuda*/targets/*/lib/libcuda.so*", + "/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*", + "/usr/lib/*-linux-gnu/libcuda.so*", + "/usr/lib/wsl/lib/libcuda.so*", + "/usr/lib/wsl/drivers/*/libcuda.so*", + "/opt/cuda/lib*/libcuda.so*", + "/usr/local/cuda/lib*/libcuda.so*", + "/usr/lib*/libcuda.so*", + "/usr/local/lib*/libcuda.so*", +} + +var OneapiGlobs = []string{ + "/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*", + "/usr/lib*/libze_intel_gpu.so*", +} + +var CudartMgmtName = "libcudart.so*" +var NvcudaMgmtName = "libcuda.so*" +var NvmlMgmtName = "" // not currently wired on linux +var OneapiMgmtName = "libze_intel_gpu.so" + +func GetCPUMem() (memInfo, error) { + var mem memInfo + var total, available, free, buffers, cached uint64 + f, err := os.Open("/proc/meminfo") + if err != nil { + return mem, err + } + defer f.Close() + s := bufio.NewScanner(f) + for s.Scan() { + line := s.Text() + switch { + case strings.HasPrefix(line, "MemTotal:"): + _, err = fmt.Sscanf(line, "MemTotal:%d", &total) + case strings.HasPrefix(line, "MemAvailable:"): + _, err = fmt.Sscanf(line, "MemAvailable:%d", &available) + case strings.HasPrefix(line, "MemFree:"): + _, err = fmt.Sscanf(line, "MemFree:%d", &free) + case strings.HasPrefix(line, 
"Buffers:"): + _, err = fmt.Sscanf(line, "Buffers:%d", &buffers) + case strings.HasPrefix(line, "Cached:"): + _, err = fmt.Sscanf(line, "Cached:%d", &cached) + default: + continue + } + if err != nil { + return mem, err + } + + if total > 0 && available > 0 { + mem.TotalMemory = total * format.KibiByte + mem.FreeMemory = available * format.KibiByte + return mem, nil + } + } + mem.TotalMemory = total * format.KibiByte + mem.FreeMemory = (free + buffers + cached) * format.KibiByte + return mem, nil +} diff --git a/gpu/gpu_windows.go b/gpu/gpu_windows.go new file mode 100644 index 00000000..f8c2e76f --- /dev/null +++ b/gpu/gpu_windows.go @@ -0,0 +1,55 @@ +package gpu + +import ( + "fmt" + "syscall" + "unsafe" +) + +type MEMORYSTATUSEX struct { + length uint32 + MemoryLoad uint32 + TotalPhys uint64 + AvailPhys uint64 + TotalPageFile uint64 + AvailPageFile uint64 + TotalVirtual uint64 + AvailVirtual uint64 + AvailExtendedVirtual uint64 +} + +var ( + k32 = syscall.NewLazyDLL("kernel32.dll") + globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx") + sizeofMemoryStatusEx = uint32(unsafe.Sizeof(MEMORYSTATUSEX{})) +) + +var CudartGlobs = []string{ + "c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll", +} + +var NvmlGlobs = []string{ + "c:\\Windows\\System32\\nvml.dll", +} + +var NvcudaGlobs = []string{ + "c:\\windows\\system*\\nvcuda.dll", +} + +var OneapiGlobs = []string{ + "c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll", +} + +var CudartMgmtName = "cudart64_*.dll" +var NvcudaMgmtName = "nvcuda.dll" +var NvmlMgmtName = "nvml.dll" +var OneapiMgmtName = "ze_intel_gpu64.dll" + +func GetCPUMem() (memInfo, error) { + memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx} + r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus))) + if r1 == 0 { + return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err) + } + return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys}, nil +} diff --git a/gpu/types.go b/gpu/types.go index 2b1ea429..47355959 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -18,7 +18,7 @@ type GpuInfo struct { Library string `json:"library,omitempty"` // Optional variant to select (e.g. 
versions, cpu feature flags) - Variant string `json:"variant,omitempty"` + Variant CPUCapability `json:"variant"` // MinimumMemory represents the minimum memory required to use the GPU MinimumMemory uint64 `json:"-"` @@ -44,21 +44,21 @@ type CPUInfo struct { type CudaGPUInfo struct { GpuInfo - index int // nolint: unused + index int //nolint:unused,nolintlint } type CudaGPUInfoList []CudaGPUInfo type RocmGPUInfo struct { GpuInfo - usedFilepath string // nolint: unused - index int // nolint: unused + usedFilepath string //nolint:unused,nolintlint + index int //nolint:unused,nolintlint } type RocmGPUInfoList []RocmGPUInfo type OneapiGPUInfo struct { GpuInfo - driverIndex int // nolint: unused - gpuIndex int // nolint: unused + driverIndex int //nolint:unused,nolintlint + gpuIndex int //nolint:unused,nolintlint } type OneapiGPUInfoList []OneapiGPUInfo @@ -71,8 +71,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { for _, info := range l { found := false requested := info.Library - if info.Variant != "" { - requested += "_" + info.Variant + if info.Variant != CPUCapabilityNone { + requested += "_" + info.Variant.String() } for i, lib := range libs { if lib == requested { @@ -117,30 +117,19 @@ type CPUCapability uint32 var GPURunnerCPUCapability = CPUCapabilityAVX const ( - CPUCapabilityBase CPUCapability = iota + CPUCapabilityNone CPUCapability = iota CPUCapabilityAVX CPUCapabilityAVX2 // TODO AVX512 ) -func (c CPUCapability) ToString() string { - switch c { - case CPUCapabilityAVX: - return "AVX" - case CPUCapabilityAVX2: - return "AVX2" - default: - return "no vector extensions" - } -} - -func (c CPUCapability) ToVariant() string { +func (c CPUCapability) String() string { switch c { case CPUCapabilityAVX: return "avx" case CPUCapabilityAVX2: return "avx2" default: - return "" + return "no vector extensions" } } diff --git a/integration/context_test.go b/integration/context_test.go index 025b803d..46fac5ea 100644 --- a/integration/context_test.go +++ b/integration/context_test.go @@ -11,7 +11,8 @@ import ( ) func TestContextExhaustion(t *testing.T) { - ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute) // Longer needed for small footprint GPUs + // Longer needed for small footprint GPUs + ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute) defer cancel() // Set up the test data req := api.GenerateRequest{ diff --git a/llm/memory.go b/llm/memory.go index 6f830cb1..223a1899 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -1,7 +1,6 @@ package llm import ( - "fmt" "log/slog" "strconv" "strings" @@ -69,13 +68,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts // Conditional output size on GPU 0 var memoryLayerOutput uint64 - var includeOutput bool - // One extra layer as a pad for each GPU - var layerBuffer uint64 - - // The sizes of the main layers - var layerSizes []uint64 + // The size of a layer + var layerSize uint64 // The sum of all the layer sizes (just for logging) var memoryWeights uint64 @@ -102,12 +97,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts layers := ggml.Tensors().Layers() // add one layer worth of memory as a buffer if blk0, ok := layers["blk.0"]; ok { - layerBuffer = blk0.size() + layerSize = blk0.size() + } else { + slog.Warn("model missing blk.0 layer size") } // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() /
ggml.KV().HeadCount() * ggml.KV().HeadCountKV() + // KV is proportional to the number of layers + layerSize += kv / ggml.KV().BlockCount() + graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch))) if graphPartialOffload == 0 { graphPartialOffload = ggml.KV().GQA() * kv / 6 @@ -119,6 +119,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts // on metal there's no partial offload overhead if gpus[0].Library == "metal" { graphPartialOffload = graphFullOffload + } else if len(gpus) > 1 { + // multigpu should always use the partial graph size + graphFullOffload = graphPartialOffload } if layer, ok := layers["output_norm"]; ok { @@ -130,16 +133,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts memoryLayerOutput += layer.size() } - if gpus[0].Library == "metal" && opts.UseMMap { - includeOutput = true - } else if gpus[0].Library != "metal" || !opts.UseMMap { - includeOutput = true - } - + // Output layer handled at the end if we have space gpuZeroOverhead := projectorSize - if includeOutput { - gpuZeroOverhead += memoryLayerOutput - } // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer var layerCount int @@ -156,12 +151,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts gzo = gpuZeroOverhead } // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer - if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer { + if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize { slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i]) continue } gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]}) - gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full + gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. 
full } var gpuZeroID int @@ -170,23 +165,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts gpuAllocations[gpuZeroID] += gpuZeroOverhead } - layerSizes = make([]uint64, int(ggml.KV().BlockCount())) - for i := range int(ggml.KV().BlockCount()) { - if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { - memoryLayer := blk.size() - - // KV is proportional to the number of layers - memoryLayer += kv / ggml.KV().BlockCount() - layerSizes[i] = memoryLayer - memoryWeights += memoryLayer - } - } - // For all the layers, find where they can fit on the GPU(s) - for i := range layerSizes { - if layerSizes[i] == 0 { - continue - } + for i := range int(ggml.KV().BlockCount()) { + memoryWeights += layerSize + if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { // Stop allocating on GPU(s) once we hit the users target NumGPU continue @@ -196,8 +178,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts for j := len(gpusWithSpace); j > 0; j-- { g := gpusWithSpace[i%j] used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) - if g.g.FreeMemory > used+layerSizes[i] { - gpuAllocations[g.i] += layerSizes[i] + if g.g.FreeMemory > used+layerSize { + gpuAllocations[g.i] += layerSize layerCounts[g.i]++ layerCount++ break @@ -205,17 +187,18 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) } } - } if layerCount >= int(ggml.KV().BlockCount()) { fullyLoaded = true } else { for i := layerCount; i < int(ggml.KV().BlockCount()); i++ { - overflow += layerSizes[i] + overflow += layerSize } } - // Find where the output fits - if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) { + + // Determine if we need to consider output then find where it fits + if ((gpus[0].Library == "metal" && opts.UseMMap) || (gpus[0].Library != "metal" || !opts.UseMMap)) && + memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) { for j := len(gpusWithSpace); j > 0; j-- { g := gpusWithSpace[layerCount%j] used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) @@ -226,6 +209,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts break } } + if layerCount < int(ggml.KV().BlockCount())+1 { fullyLoaded = false overflow += memoryLayerOutput @@ -253,7 +237,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts var memoryRequiredPartial, memoryRequiredTotal uint64 for i := range gpuAllocations { memoryRequiredPartial += gpuAllocations[i] - } memoryRequiredTotal = memoryRequiredPartial + overflow diff --git a/llm/memory_test.go b/llm/memory_test.go index 0adbc541..8eaa0771 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -18,7 +18,7 @@ func TestEstimateGPULayers(t *testing.T) { envconfig.Debug = true modelName := "dummy" f, err := os.CreateTemp(t.TempDir(), modelName) - assert.Nil(t, err) + require.NoError(t, err) defer f.Close() gguf := NewGGUFV3(binary.LittleEndian) inputLayerCount := 5 @@ -30,7 +30,7 @@ func TestEstimateGPULayers(t *testing.T) { {Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, } - assert.Equal(t, inputLayerCount+1, len(tensors)) + assert.Len(t, tensors, inputLayerCount+1) err = gguf.Encode(f, KV{ "general.architecture": 
"llama", "general.name": "name", @@ -56,9 +56,11 @@ func TestEstimateGPULayers(t *testing.T) { } projectors := []string{} opts := api.DefaultOptions() - estimate := EstimateGPULayers(gpus, ggml, projectors, opts) - assert.Equal(t, 0, estimate.Layers) - assert.Equal(t, uint64(0), estimate.Graph) + t.Run("cpu", func(t *testing.T) { + estimate := EstimateGPULayers(gpus, ggml, projectors, opts) + assert.Equal(t, 0, estimate.Layers) + assert.Equal(t, uint64(0), estimate.Graph) + }) // derived from the dummy ggml file above graphPartialOffload := uint64(202377216) @@ -80,7 +82,10 @@ func TestEstimateGPULayers(t *testing.T) { }, } // Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1 - for i, s := range [][]uint64{ + for i, s := range []struct { + layer0, layer1 uint64 + expect0, expect1 uint64 + }{ {1, 1, 1, 1}, {2, 1, 2, 1}, {2, 2, 2, 2}, @@ -90,27 +95,33 @@ func TestEstimateGPULayers(t *testing.T) { {6, 6, 3, 3}, {0, 3, 0, 3}, } { - gpus[0].FreeMemory = 0 - gpus[1].FreeMemory = 0 - gpus[0].FreeMemory += projectorSize + memoryLayerOutput - gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1 - gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1 - gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload) - gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload) - estimate = EstimateGPULayers(gpus, ggml, projectors, opts) - assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s) - assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s) - var layerSums uint64 - for _, b := range estimate.GPUSizes { - layerSums += b - } - if estimate.Layers < inputLayerCount+1 { - assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate) - assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate) - } else { - assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate) - assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate) - } + t.Run(fmt.Sprintf("%v", s), func(t *testing.T) { + gpus[0].FreeMemory = 0 + gpus[1].FreeMemory = 0 + gpus[0].FreeMemory += projectorSize + if s.layer0 > 0 { + gpus[0].FreeMemory += memoryLayerOutput + } else { + gpus[1].FreeMemory += memoryLayerOutput + } + gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1 + gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1 + gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload) + gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload) + estimate := EstimateGPULayers(gpus, ggml, projectors, opts) + assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s) + assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s) + var layerSums uint64 + for _, b := range estimate.GPUSizes { + layerSums += b + } + if estimate.Layers < inputLayerCount+1 { + assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate) + assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate) + } else { + assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate) + assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate) + } + }) } - } diff --git a/llm/payload.go b/llm/payload.go index a025ee34..20dcee7b 100644 --- a/llm/payload.go +++ b/llm/payload.go @@ -82,8 
+82,8 @@ func serversForGpu(info gpu.GpuInfo) []string { // glob workDir for files that start with ollama_ availableServers := availableServers() requested := info.Library - if info.Variant != "" { - requested += "_" + info.Variant + if info.Variant != gpu.CPUCapabilityNone { + requested += "_" + info.Variant.String() } servers := []string{} @@ -117,14 +117,14 @@ func serversForGpu(info gpu.GpuInfo) []string { // Load up the best CPU variant if not primary requested if info.Library != "cpu" { - variant := gpu.GetCPUVariant() + variant := gpu.GetCPUCapability() // If no variant, then we fall back to default // If we have a variant, try that if we find an exact match // Attempting to run the wrong CPU instructions will panic the // process - if variant != "" { + if variant != gpu.CPUCapabilityNone { for cmp := range availableServers { - if cmp == "cpu_"+variant { + if cmp == "cpu_"+variant.String() { servers = append(servers, cmp) break } @@ -146,11 +146,11 @@ func serverForCpu() string { if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" { return "metal" } - variant := gpu.GetCPUVariant() + variant := gpu.GetCPUCapability() availableServers := availableServers() - if variant != "" { + if variant != gpu.CPUCapabilityNone { for cmp := range availableServers { - if cmp == "cpu_"+variant { + if cmp == "cpu_"+variant.String() { return cmp } } diff --git a/llm/server.go b/llm/server.go index 089c35d1..6313fc32 100644 --- a/llm/server.go +++ b/llm/server.go @@ -39,7 +39,7 @@ type LlamaServer interface { Close() error EstimatedVRAM() uint64 // Total VRAM across all GPUs EstimatedTotal() uint64 - EstimagedVRAMByGPU(gpuID string) uint64 + EstimatedVRAMByGPU(gpuID string) uint64 } // llmServer is an instance of the llama.cpp server @@ -1016,7 +1016,7 @@ func (s *llmServer) EstimatedTotal() uint64 { return s.estimate.TotalSize } -func (s *llmServer) EstimagedVRAMByGPU(gpuID string) uint64 { +func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 { for i, gpu := range s.gpus { if gpu.ID == gpuID { return s.estimate.GPUSizes[i] diff --git a/server/sched.go b/server/sched.go index e1ceccc1..42439554 100644 --- a/server/sched.go +++ b/server/sched.go @@ -182,7 +182,7 @@ func (s *Scheduler) processPending(ctx context.Context) { // We want to avoid loading on any GPUs that have other // models still loading on them to avoid potential races // with VRAM consumption ramping up during load - availGpus := s.filterGPUsWithLoadingModels(gpus) + availGpus := s.filterGPUsWithoutLoadingModels(gpus) // Update free memory from currently loaded models s.updateFreeSpace(availGpus) @@ -414,9 +414,7 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) { r.refMu.Lock() if r.llama != nil { for _, gpu := range allGpus { - // if slices.Contains(gpuIDs, gpu.ID) { - predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimagedVRAMByGPU(gpu.ID) - // } + predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID) } } else { slog.Warn("unexpected nil runner reference, memory prediction may be incorrect") @@ -448,7 +446,7 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) { // to avoid scheduling another model on the same GPU(s) that haven't stabilized. // This routine returns the set of GPUs that do not have an active loading model. 
// If all GPUs have loading models, an empty list will be returned (not a single CPU entry) -func (s *Scheduler) filterGPUsWithLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList { +func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList { ret := append(gpu.GpuInfoList{}, allGpus...) s.loadedMu.Lock() defer s.loadedMu.Unlock() @@ -702,5 +700,4 @@ func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, // TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room return s.findRunnerToUnload() - } diff --git a/server/sched_test.go b/server/sched_test.go index 639da5f9..15184de3 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -156,7 +156,7 @@ func TestRequests(t *testing.T) { // Same model, same request scenario1a := newScenario(t, ctx, "ollama-model-1", 10) - scenario1a.req.sessionDuration = 0 + scenario1a.req.sessionDuration = 5 * time.Millisecond scenario1b := newScenario(t, ctx, "ollama-model-1", 11) scenario1b.req.model = scenario1a.req.model scenario1b.ggml = scenario1a.ggml @@ -167,6 +167,7 @@ func TestRequests(t *testing.T) { tmpModel := *scenario1a.req.model scenario2a.req.model = &tmpModel scenario2a.ggml = scenario1a.ggml + scenario2a.req.sessionDuration = 5 * time.Millisecond // Multiple loaded models scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte) @@ -316,7 +317,6 @@ func TestGetRunner(t *testing.T) { ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) defer done() - // Same model, same request scenario1a := newScenario(t, ctx, "ollama-model-1a", 10) scenario1a.req.sessionDuration = 0 scenario1b := newScenario(t, ctx, "ollama-model-1b", 10) @@ -475,6 +475,40 @@ func TestUpdateFreeSpace(t *testing.T) { require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory) } +func TestFilterGPUsWithoutLoadingModels(t *testing.T) { + ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer done() + gpus := gpu.GpuInfoList{ + { + Library: "cuda", + ID: "0", + }, + { + Library: "cuda", + ID: "1", + }, + } + r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true} + + s := InitScheduler(ctx) + s.loadedMu.Lock() + s.loaded["a"] = r1 + s.loadedMu.Unlock() + + tmp := s.filterGPUsWithoutLoadingModels(gpus) + require.Len(t, tmp, 1) + require.Equal(t, "1", tmp[0].ID) + + r1.gpus = gpu.GpuInfoList{gpus[1]} + tmp = s.filterGPUsWithoutLoadingModels(gpus) + require.Len(t, tmp, 1) + require.Equal(t, "0", tmp[0].ID) + + r1.gpus = gpu.GpuInfoList{} + tmp = s.filterGPUsWithoutLoadingModels(gpus) + require.Len(t, tmp, 2) +} + func TestFindRunnerToUnload(t *testing.T) { ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) defer done() @@ -607,4 +641,4 @@ func (s *mockLlm) Close() error { } func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM } func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal } -func (s *mockLlm) EstimagedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] } +func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
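The llm/memory.go rewrite above drops the per-layer size array in favor of a single uniform layerSize (the blk.0 weights plus a proportional share of the KV cache) and assigns layers to GPUs round-robin, removing a GPU from the candidate set once it can no longer hold another layer on top of its graph allocation. A condensed, self-contained sketch of that placement loop; the types and the numbers in main are simplified stand-ins, not the actual GpuInfo or MemoryEstimate structures from the patch:

```go
package main

import "fmt"

// gpuState is a simplified stand-in for the patch's per-GPU bookkeeping.
type gpuState struct {
	id        int
	free      uint64 // bytes reported free on the device
	allocated uint64 // bytes assigned so far
}

// placeLayers hands out count layers of layerSize bytes round-robin,
// reserving graph bytes on top of each GPU's running allocation, and
// returns how many layers found a home.
func placeLayers(gpus []*gpuState, count int, layerSize, graph uint64) int {
	// Work on a private copy so removals don't disturb the caller's slice.
	gpus = append([]*gpuState(nil), gpus...)
	placed := 0
	for i := 0; i < count; i++ {
		for j := len(gpus); j > 0; j-- {
			g := gpus[i%j]
			if g.free > g.allocated+graph+layerSize {
				g.allocated += layerSize
				placed++
				break
			}
			// This GPU can no longer fit a layer; drop it from the
			// candidate set, like gpusWithSpace in the patch.
			gpus = append(gpus[:i%j], gpus[i%j+1:]...)
		}
	}
	return placed
}

func main() {
	g0 := &gpuState{id: 0, free: 6 << 30} // 6 GiB
	g1 := &gpuState{id: 1, free: 4 << 30} // 4 GiB
	n := placeLayers([]*gpuState{g0, g1}, 32, 400<<20, 200<<20)
	fmt.Println("layers placed:", n, "of 32")
	fmt.Printf("gpu 0: %d MiB, gpu 1: %d MiB\n", g0.allocated>>20, g1.allocated>>20)
}
```

Because the candidate set only shrinks, the loop always terminates; any layers left unplaced correspond to the overflow that EstimateGPULayers attributes back to system memory.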