From b05c9e83d91dd25ce578b42aef5947a89283231d Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Thu, 5 Sep 2024 13:46:35 -0700
Subject: [PATCH] Introduce GPU Overhead env var (#5922)

Provide a mechanism for users to set aside an amount of VRAM on each GPU
to make room for other applications they want to start after Ollama, or
workaround memory prediction bugs
---
 cmd/cmd.go          |  1 +
 envconfig/config.go | 20 ++++++++++++++++++++
 llm/memory.go       | 10 +++++++---
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index f6d31f5b..890b839a 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1421,6 +1421,7 @@ func NewCLI() *cobra.Command {
                 envVars["OLLAMA_TMPDIR"],
                 envVars["OLLAMA_FLASH_ATTENTION"],
                 envVars["OLLAMA_LLM_LIBRARY"],
+                envVars["OLLAMA_GPU_OVERHEAD"],
             })
         default:
             appendEnvDocs(cmd, envs)
diff --git a/envconfig/config.go b/envconfig/config.go
index 908636a9..b47fd8d5 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -231,6 +231,25 @@ var (
     MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
 )
 
+func Uint64(key string, defaultValue uint64) func() uint64 {
+    return func() uint64 {
+        if s := Var(key); s != "" {
+            if n, err := strconv.ParseUint(s, 10, 64); err != nil {
+                slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+            } else {
+                return n
+            }
+        }
+
+        return defaultValue
+    }
+}
+
+var (
+    // Set aside VRAM per GPU
+    GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
+)
+
 type EnvVar struct {
     Name  string
     Value any
@@ -241,6 +260,7 @@ func AsMap() map[string]EnvVar {
     ret := map[string]EnvVar{
         "OLLAMA_DEBUG":           {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
         "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
+        "OLLAMA_GPU_OVERHEAD":    {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
         "OLLAMA_HOST":            {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
         "OLLAMA_KEEP_ALIVE":      {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
         "OLLAMA_LLM_LIBRARY":     {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
diff --git a/llm/memory.go b/llm/memory.go
index 19b12cbf..185bc8fb 100644
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -7,6 +7,7 @@ import (
     "strings"
 
     "github.com/ollama/ollama/api"
+    "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/format"
     "github.com/ollama/ollama/gpu"
 )
@@ -94,6 +95,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
     // Overflow that didn't fit into the GPU
     var overflow uint64
 
+    overhead := envconfig.GpuOverhead()
     availableList := make([]string, len(gpus))
     for i, gpu := range gpus {
         availableList[i] = format.HumanBytes2(gpu.FreeMemory)
@@ -164,7 +166,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
             gzo = gpuZeroOverhead
         }
         // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-        if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+        if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
             slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
             continue
         }
@@ -196,7 +198,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
         for j := len(gpusWithSpace); j > 0; j-- {
             g := gpusWithSpace[i%j]
             used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-            if g.g.FreeMemory > used+layerSize {
+            if (g.g.FreeMemory - overhead) > used+layerSize {
                 gpuAllocations[g.i] += layerSize
                 layerCounts[g.i]++
                 layerCount++
@@ -219,7 +221,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
         for j := len(gpusWithSpace); j > 0; j-- {
             g := gpusWithSpace[layerCount%j]
             used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-            if g.g.FreeMemory > used+memoryLayerOutput {
+            if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
                 gpuAllocations[g.i] += memoryLayerOutput
                 layerCounts[g.i]++
                 layerCount++
@@ -306,6 +308,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 }
 
 func (m MemoryEstimate) log() {
+    overhead := envconfig.GpuOverhead()
     slog.Info(
         "offload to "+m.inferenceLibrary,
         slog.Group(
@@ -323,6 +326,7 @@ func (m MemoryEstimate) log() {
             "memory",
             // memory available by GPU for offloading
             "available", m.availableList,
+            "gpu_overhead", format.HumanBytes2(overhead),
             slog.Group(
                 "required",
                 // memory required for full offloading
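
The sketch below is an illustrative companion to the patch, not part of it: it shows how OLLAMA_GPU_OVERHEAD is meant to be read (a per-GPU reserve in bytes, falling back to 0 when unset or unparseable) and how that reserve shrinks the free memory the layer-fitting checks above can plan against. The parseOverhead and availableVRAM helpers are assumptions made for this example, and the clamp against uint64 wraparound is an extra precaution; the patch itself subtracts the overhead directly from FreeMemory.

package main

import (
    "fmt"
    "os"
    "strconv"
)

// parseOverhead mirrors the behavior of the patch's Uint64 reader: a missing
// or invalid value falls back to the default of 0 bytes reserved.
// (Hypothetical helper for this sketch, not code from the patch.)
func parseOverhead() uint64 {
    if s := os.Getenv("OLLAMA_GPU_OVERHEAD"); s != "" {
        if n, err := strconv.ParseUint(s, 10, 64); err == nil {
            return n
        }
    }
    return 0
}

// availableVRAM returns the memory left for planning layers after the reserve
// is taken out. The clamp at zero is an added assumption: both values are
// uint64, so a reserve larger than the reported free memory would otherwise
// wrap around.
func availableVRAM(freeMemory, overhead uint64) uint64 {
    if overhead >= freeMemory {
        return 0
    }
    return freeMemory - overhead
}

func main() {
    overhead := parseOverhead()
    free := uint64(8 << 30) // e.g. a GPU reporting 8 GiB free

    // With OLLAMA_GPU_OVERHEAD=1073741824 (1 GiB), about 7 GiB remains for
    // model layers on this GPU.
    fmt.Printf("plannable VRAM: %d bytes\n", availableVRAM(free, overhead))
}

In practice the reserve is supplied in bytes when starting the server, for example OLLAMA_GPU_OVERHEAD=1073741824 ollama serve to hold back roughly 1 GiB on each GPU before any model layers are scheduled.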