From c4014e73a25488b3f1488b96e82b578d8261993f Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Fri, 10 May 2024 15:09:48 -0700
Subject: [PATCH] Fall back to CPU runner with zero layers

---
 llm/server.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llm/server.go b/llm/server.go
index 81a2dec4..33c56f1f 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -105,6 +105,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		// disable partial offloading when model is greater than total system memory as this
 		// can lead to locking up the system
 		opts.NumGPU = 0
+	} else if gpus[0].Library != "metal" && layers == 0 {
+		// Don't bother loading into the GPU if no layers can fit
+		cpuRunner = serverForCpu()
+		gpuCount = 0
 	} else if opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu" {
 		opts.NumGPU = layers
 	}