From 2665f3c28e9b94e9fd9a7a31bee849058fe34ccc Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Thu, 26 Oct 2023 20:49:55 -0400
Subject: [PATCH] offload 75% of available vram to improve stability (#921)

---
 llm/llama.go | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/llm/llama.go b/llm/llama.go
index 61288b52..b257f3b8 100644
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -243,12 +243,15 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 		return 0
 	}
 
-	// Calculate bytes per layer
-	// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
+	/*
+		Calculate bytes per layer; this will roughly be the size of the model file divided by the number of layers.
+		We can store the model weights and the kv cache in vram;
+		to enable kv cache vram storage, add two additional layers to the number of layers retrieved from the model file.
+	*/
 	bytesPerLayer := fileSizeBytes / numLayer
 
-	// max number of layers we can fit in VRAM, subtract 8% to prevent consuming all available VRAM and running out of memory
-	layers := int(freeBytes/bytesPerLayer) * 92 / 100
+	// 75% of the absolute max number of layers we can fit in available VRAM; off-loading too many layers to the GPU can cause OOM errors
+	layers := int(freeBytes/bytesPerLayer) * 3 / 4
 
 	log.Printf("%d MB VRAM available, loading up to %d GPU layers", freeBytes/(1024*1024), layers)
 	return layers
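
For reference, below is a minimal standalone sketch of the heuristic this patch settles on: estimate bytes per layer from the model file size, then offload only 75% of the layers that would theoretically fit in free VRAM, leaving headroom (for the kv cache, among other things) to avoid OOM errors. The function name numGPULayers and the example model/VRAM figures are hypothetical and only illustrate the calculation; the real NumGPU in llm/llama.go additionally takes api.Options and derives freeBytes from the detected GPU memory.

package main

import "fmt"

// numGPULayers is a hypothetical re-statement of the heuristic in this patch.
// It estimates the per-layer size from the model file, then returns 75% of the
// number of layers that would fit in the available VRAM.
func numGPULayers(numLayer, fileSizeBytes, freeBytes int64) int {
	if numLayer <= 0 || fileSizeBytes <= 0 || freeBytes <= 0 {
		return 0
	}

	// Rough per-layer cost: model file size divided by the number of layers.
	bytesPerLayer := fileSizeBytes / numLayer

	// 75% of the absolute max number of layers that fit in available VRAM.
	// A real caller would also cap this at the model's total layer count.
	return int(freeBytes/bytesPerLayer) * 3 / 4
}

func main() {
	// Made-up example: a 4 GiB model with 32 layers and 3 GiB of free VRAM.
	numLayer := int64(32)
	fileSizeBytes := int64(4 << 30)
	freeBytes := int64(3 << 30)

	// bytesPerLayer = 128 MiB, 24 layers fit, 75% of that is 18.
	fmt.Printf("loading up to %d GPU layers\n", numGPULayers(numLayer, fileSizeBytes, freeBytes))
}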