Refine CPU load behavior with system memory visibility

parent 434dfe30c5
commit fc37c192ae
gpu/gpu.go (96 changes)

@@ -11,6 +11,8 @@ package gpu
 */
 import "C"
 import (
+	"bufio"
+	"bytes"
 	"fmt"
 	"log/slog"
 	"os"
@@ -246,6 +248,17 @@ func initOneAPIHandles() *oneapiHandles {
 	return oHandles
 }
 
+func GetCPUInfo() GpuInfoList {
+	gpuMutex.Lock()
+	if !bootstrapped {
+		gpuMutex.Unlock()
+		GetGPUInfo()
+	} else {
+		gpuMutex.Unlock()
+	}
+	return GpuInfoList{cpus[0].GpuInfo}
+}
+
 func GetGPUInfo() GpuInfoList {
 	// TODO - consider exploring lspci (and equivalent on windows) to check for
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
@@ -279,22 +292,19 @@ func GetGPUInfo() GpuInfoList {
 		needRefresh = false
 		cpuCapability = getCPUCapability()
 		var memInfo C.mem_info_t
-		C.cpu_check_ram(&memInfo)
-		if memInfo.err != nil {
-			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
-			C.free(unsafe.Pointer(memInfo.err))
-			return []GpuInfo{}
+
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
 		}
-		cpuInfo := CPUInfo{
+		cpus = []CPUInfo{CPUInfo{
 			GpuInfo: GpuInfo{
+				memInfo: mem,
 				Library: "cpu",
 				Variant: cpuCapability.ToVariant(),
+				ID:      "0",
 			},
-		}
-		cpuInfo.TotalMemory = uint64(memInfo.total)
-		cpuInfo.FreeMemory = uint64(memInfo.free)
-		cpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-		cpus = []CPUInfo{cpuInfo}
+		}}
 
 		// Fallback to CPU mode if we're lacking required vector extensions on x86
 		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
@@ -394,7 +404,25 @@ func GetGPUInfo() GpuInfoList {
 
 	// Refresh free memory usage
 	if needRefresh {
-		// TODO - CPU system memory tracking/refresh
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
+		} else {
+			slog.Debug("updating system memory data",
+				slog.Group(
+					"before",
+					"total", format.HumanBytes2(cpus[0].TotalMemory),
+					"free", format.HumanBytes2(cpus[0].FreeMemory),
+				),
+				slog.Group(
+					"now",
+					"total", format.HumanBytes2(mem.TotalMemory),
+					"free", format.HumanBytes2(mem.FreeMemory),
+				),
+			)
+			cpus[0].FreeMemory = mem.FreeMemory
+		}
+
 		var memInfo C.mem_info_t
 		if cHandles == nil && len(cudaGPUs) > 0 {
 			cHandles = initCudaHandles()
@@ -455,7 +483,7 @@ func GetGPUInfo() GpuInfoList {
 			oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
 		}
 
-		err := RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
+		err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
 		if err != nil {
 			slog.Debug("problem refreshing ROCm free memory", "error", err)
 		}
@@ -478,6 +506,9 @@ func GetGPUInfo() GpuInfoList {
 }
 
 func GetCPUMem() (memInfo, error) {
+	if runtime.GOOS == "linux" {
+		return GetLinuxMemInfo()
+	}
 	var ret memInfo
 	var info C.mem_info_t
 	C.cpu_check_ram(&info)
@@ -651,3 +682,42 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
+
+func GetLinuxMemInfo() (memInfo, error) {
+	var mem memInfo
+	var total, available, free, buffers, cached uint64
+	f, err := os.Open("/proc/meminfo")
+	if err != nil {
+		return mem, err
+	}
+	defer f.Close()
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		switch {
+		case bytes.HasPrefix(s.Bytes(), []byte(`MemTotal:`)):
+			_, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total)
+		case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)):
+			_, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
+		case bytes.HasPrefix(s.Bytes(), []byte(`MemFree:`)):
+			_, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free)
+		case bytes.HasPrefix(s.Bytes(), []byte(`Buffers:`)):
+			_, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers)
+		case bytes.HasPrefix(s.Bytes(), []byte(`Cached:`)):
+			_, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached)
+		default:
+			continue
+		}
+		if err != nil {
+			return mem, err
+		}
+
+		if total > 0 && available > 0 {
+			mem.TotalMemory = total * 1024
+			mem.FreeMemory = available * 1024
+			return mem, nil
+		}
+	}
+	mem.TotalMemory = total * 1024
+	mem.FreeMemory = (free + buffers + cached) * 1024
+	return mem, nil
+}
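For reference, a minimal standalone sketch (not part of this commit) of the same /proc/meminfo accounting the new GetLinuxMemInfo uses: prefer MemAvailable when the kernel reports it, otherwise approximate free memory as MemFree + Buffers + Cached. Values in /proc/meminfo are in kB, hence the *1024. Names below are illustrative only.

package main

import (
	"bufio"
	"fmt"
	"os"
)

func main() {
	f, err := os.Open("/proc/meminfo")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer f.Close()

	// Each line looks like "MemTotal:       32693536 kB"; collect name -> kB.
	fields := map[string]uint64{}
	s := bufio.NewScanner(f)
	for s.Scan() {
		var name string
		var kb uint64
		if _, err := fmt.Sscanf(s.Text(), "%s %d", &name, &kb); err == nil {
			fields[name] = kb
		}
	}

	total := fields["MemTotal:"] * 1024
	// Prefer MemAvailable; fall back to MemFree + Buffers + Cached on older kernels.
	free := fields["MemAvailable:"] * 1024
	if free == 0 {
		free = (fields["MemFree:"] + fields["Buffers:"] + fields["Cached:"]) * 1024
	}
	fmt.Printf("system memory: total=%d bytes free=%d bytes\n", total, free)
}

MemAvailable is usually the better signal because the kernel already discounts page cache it cannot actually reclaim.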
gpu/gpu_darwin.go

@@ -42,6 +42,17 @@ func GetGPUInfo() GpuInfoList {
 	return []GpuInfo{info}
 }
 
+func GetCPUInfo() GpuInfoList {
+	mem, _ := GetCPUMem()
+	return []GpuInfo{
+		{
+			Library: "cpu",
+			Variant: GetCPUVariant(),
+			memInfo: mem,
+		},
+	}
+}
+
 func GetCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
gpu/gpu_info_cpu.c

@@ -35,11 +35,7 @@ void cpu_check_ram(mem_info_t *resp) {
 }
 
 #elif __APPLE__
-// TODO consider an Apple implementation that does something useful
-// mem_info_t cpu_check_ram() {
-//   mem_info_t resp = {0, 0, NULL};
-//   return resp;
-// }
+// Unused - see gpu_darwin.go
 #else
 #error "Unsupported platform"
 #endif
gpu/gpu_info_nvml.c

@@ -11,8 +11,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   char buf[buflen + 1];
   int i;
 
-  LOG(1, "XXX starting nvml_init %s\n", nvml_lib_path);
-
   struct lookup {
     char *s;
     void **p;
@@ -37,13 +35,11 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   }
 
   // TODO once we've squashed the remaining corner cases remove this log
   // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
 
-  LOG(1, "XXX wiring functions nvml_init\n");
-
   for (i = 0; l[i].s != NULL; i++) {
     // TODO once we've squashed the remaining corner cases remove this log
-    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+    // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
 
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
     if (!l[i].p) {
@@ -58,7 +54,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
       return;
     }
   }
-  LOG(1, "XXX calling init_v2\n");
 
   ret = (*resp->ch.nvmlInit_v2)();
   if (ret != NVML_SUCCESS) {
@@ -69,8 +64,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
     resp->err = strdup(buf);
     return;
   }
-  LOG(1, "XXX nvml_init done\n");
-
 }
 
 
@@ -78,7 +71,6 @@ void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *tot
   nvmlDevice_t device;
   nvmlMemory_t memInfo = {0};
   nvmlReturn_t ret;
-  LOG(1, "XXX in nvml_get_free\n");
   ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
   if (ret != NVML_SUCCESS) {
     LOG(1, "unable to get device handle %d: %d", device_id, ret);
llm/server.go

@@ -37,8 +37,9 @@ type LlamaServer interface {
 	Tokenize(ctx context.Context, content string) ([]int, error)
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
-	EstimatedVRAM() uint64
+	EstimatedVRAM() uint64 // Total VRAM across all GPUs
 	EstimatedTotal() uint64
+	EstimagedVRAMByGPU(gpuID string) uint64
 }
 
 // llmServer is an instance of the llama.cpp server
@@ -51,7 +52,8 @@ type llmServer struct {
 
 	estimate    MemoryEstimate
 	totalLayers uint64
-	gpuCount    int
+	// gpuCount    int
+	gpus         gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
 	loadDuration time.Duration // Record how long it took the model to load
 	loadProgress float32
 
@@ -80,12 +82,13 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var cpuRunner string
 	var estimate MemoryEstimate
 	var systemMemory uint64
-	gpuCount := len(gpus)
-	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
-		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
 
+	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
+	if opts.NumGPU == 0 {
+		gpus = gpu.GetCPUInfo()
+	}
+	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 		cpuRunner = serverForCpu()
-		gpuCount = 0
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		if gpus[0].Library == "metal" {
@@ -107,7 +110,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	case gpus[0].Library != "metal" && estimate.Layers == 0:
 		// Don't bother loading into the GPU if no layers can fit
 		cpuRunner = serverForCpu()
-		gpuCount = 0
+		gpus = gpu.GetCPUInfo()
 	case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 		opts.NumGPU = estimate.Layers
 	}
@@ -246,8 +249,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		if strings.HasPrefix(servers[i], "cpu") {
-			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
-			gpuCount = 0
+			gpus = gpu.GetCPUInfo()
 		}
 
 		// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@@ -310,7 +312,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		estimate:    estimate,
 		sem:         semaphore.NewWeighted(int64(numParallel)),
 		totalLayers: ggml.KV().BlockCount() + 1,
-		gpuCount:    gpuCount,
+		gpus:        gpus,
 		done:        make(chan error, 1),
 	}
 
@@ -1014,6 +1016,15 @@ func (s *llmServer) EstimatedTotal() uint64 {
 	return s.estimate.TotalSize
 }
 
+func (s *llmServer) EstimagedVRAMByGPU(gpuID string) uint64 {
+	for i, gpu := range s.gpus {
+		if gpu.ID == gpuID {
+			return s.estimate.GPUSizes[i]
+		}
+	}
+	return 0
+}
+
 func parseDurationMs(ms float64) time.Duration {
 	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
 	if err != nil {
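Not part of the commit: a self-contained sketch of the bookkeeping change above. Instead of dividing EstimatedVRAM() evenly across a runner's GPUs, the scheduler can now subtract a per-device estimate (EstimagedVRAMByGPU in the diff) from each GPU's free memory. Types and numbers below are stand-ins; the values mirror the updated TestUpdateFreeSpace.

package main

import "fmt"

// runner stands in for a loaded llmServer; vramByGPU mirrors what the
// per-GPU estimate method returns for each device ID.
type runner struct {
	vramByGPU map[string]uint64
}

func (r runner) estimatedVRAMByGPU(id string) uint64 { return r.vramByGPU[id] }

func main() {
	// Free memory reported per GPU before accounting for loaded runners.
	free := map[string]uint64{"1": 1000, "2": 2000}

	// Two loaded runners with per-GPU estimates.
	runners := []runner{
		{vramByGPU: map[string]uint64{"1": 50, "2": 50}},
		{vramByGPU: map[string]uint64{"1": 125, "2": 75}},
	}

	for id, f := range free {
		var used uint64
		for _, r := range runners {
			used += r.estimatedVRAMByGPU(id)
		}
		fmt.Printf("GPU %s: %d free after subtracting loaded runners\n", id, f-used)
	}
}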
server/sched.go

@@ -7,7 +7,6 @@ import (
 	"log/slog"
 	"reflect"
 	"runtime"
-	"slices"
 	"sort"
 	"strings"
 	"sync"
@@ -41,6 +40,7 @@ type Scheduler struct {
 	loadFn      func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
 	newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
 	getGpuFn    func() gpu.GpuInfoList
+	getCpuFn    func() gpu.GpuInfoList
 }
 
 var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
@@ -54,6 +54,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		loaded:      make(map[string]*runnerRef),
 		newServerFn: llm.NewLlamaServer,
 		getGpuFn:    gpu.GetGPUInfo,
+		getCpuFn:    gpu.GetCPUInfo,
 	}
 	sched.loadFn = sched.load
 	return sched
@@ -131,7 +132,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			} else {
 				// Either no models are loaded or below envconfig.MaxRunners
 				// Get a refreshed GPU list
-				gpus := s.getGpuFn()
+				var gpus gpu.GpuInfoList
+				if pending.opts.NumGPU == 0 {
+					gpus = s.getCpuFn()
+				} else {
+					gpus = s.getGpuFn()
+				}
 
 				// Load model for fitting
 				ggml, err := llm.LoadModel(pending.model.ModelPath)
@@ -140,16 +146,22 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					break
 				}
 
-				// If we're CPU only mode, just limit by envconfig.MaxRunners above
-				// TODO handle system memory exhaustion
-				if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
-					slog.Debug("cpu mode with existing models, loading")
-					s.loadFn(pending, ggml, gpus)
-					break
-				}
-
-				// No models loaded. Load the model but prefer the best fit.
-				if loadedCount == 0 {
+				// Evaluate if the model will fit in the available system memory, or if we should unload a model first
+				if len(gpus) == 1 && gpus[0].Library == "cpu" {
+					if loadedCount == 0 {
+						slog.Debug("cpu mode with first model, loading")
+						s.loadFn(pending, ggml, gpus)
+						break
+					}
+					runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
+					if runnerToExpire == nil {
+						slog.Debug("cpu mode with available system memory or first model, loading")
+						s.loadFn(pending, ggml, gpus)
+						break
+					}
+					// else we need to expire a runner
+				} else if loadedCount == 0 {
+					// No models loaded. Load the model but prefer the best fit.
 					slog.Debug("loading first model", "model", pending.model.ModelPath)
 					g := pickBestFitGPUs(pending, ggml, gpus)
 					if g != nil {
@@ -159,6 +171,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}
 
+				if runnerToExpire == nil {
 					// More than one loaded model, so we have to see if the new one fits
 					// Update free memory from currently loaded models
 					s.updateFreeSpace(gpus)
@@ -170,6 +183,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					}
 					runnerToExpire = s.findRunnerToUnload()
 				}
+			}
 
 			if runnerToExpire == nil {
 				// Shouildn't happen
@@ -368,17 +382,11 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 	s.loadedMu.Lock()
 	for _, r := range s.loaded {
 		r.refMu.Lock()
-		gpuIDs := make([]string, 0, len(r.gpus))
 		if r.llama != nil {
-			// TODO this should be broken down by GPU instead of assuming uniform spread
-			estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus))
-			for _, gpu := range r.gpus {
-				gpuIDs = append(gpuIDs, gpu.ID)
-			}
 			for _, gpu := range allGpus {
-				if slices.Contains(gpuIDs, gpu.ID) {
-					predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU
-				}
+				// if slices.Contains(gpuIDs, gpu.ID) {
+				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimagedVRAMByGPU(gpu.ID)
+				// }
 			}
 		} else {
 			slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@@ -489,7 +497,8 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 
 	// CPU or Metal don't need checking, so no waiting required
 	// windows can page VRAM, only cuda currently can report accurate used vram usage
-	if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
+	if len(runner.gpus) == 0 ||
+		(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
 		(runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") {
 		finished <- struct{}{}
 		return finished
@@ -624,3 +633,19 @@ func (s *Scheduler) unloadAllRunners() {
 		}
 	}
 }
+
+// If other runners are loaded, make sure the pending request will fit in system memory
+// If not, pick a runner to unload, else return nil and the request can be loaded
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+	slog.Debug("evaluating if CPU model load will fit in available system memory")
+	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
+	if estimate.TotalSize <= gpus[0].FreeMemory {
+		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
+		return nil
+	}
+
+	// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
+
+	return s.findRunnerToUnload()
+
+}
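Not part of the commit: a rough standalone sketch of the scheduling decision above, using simplified stand-in types. With num_gpu forced to 0 the scheduler consults system memory (getCpuFn) rather than the GPU list, and a CPU-only load skips unloading an existing runner only when the estimated total size fits in free system RAM (maybeFindCPURunnerToUnload).

package main

import "fmt"

// deviceInfo is a stand-in for gpu.GpuInfo: just a library name and free bytes.
type deviceInfo struct {
	library    string
	freeMemory uint64
}

// pickDevices mirrors the processPending change: num_gpu=0 consults the CPU /
// system memory path instead of the GPU list.
func pickDevices(numGPU int, getGPU, getCPU func() []deviceInfo) []deviceInfo {
	if numGPU == 0 {
		return getCPU()
	}
	return getGPU()
}

// needUnload reports whether an existing runner must be evicted before a
// CPU-only load: only when something is already loaded and the model's
// estimated total size does not fit in free system memory.
func needUnload(estimateTotal uint64, devs []deviceInfo, loadedCount int) bool {
	if loadedCount == 0 {
		return false
	}
	return estimateTotal > devs[0].freeMemory
}

func main() {
	getCPU := func() []deviceInfo { return []deviceInfo{{"cpu", 26 << 30}} }
	getGPU := func() []deviceInfo { return []deviceInfo{{"cuda", 12 << 30}} }

	devs := pickDevices(0, getGPU, getCPU)
	fmt.Println(needUnload(20<<30, devs, 1)) // false: 20 GiB fits in 26 GiB free
	fmt.Println(needUnload(30<<30, devs, 1)) // true: a runner must be unloaded first
}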
server/sched_test.go

@@ -60,7 +60,7 @@ func TestLoad(t *testing.T) {
 	err := <-req.errCh
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
-	server := &mockLlm{estimatedVRAM: 10}
+	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
 	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
 		return server, nil
 	}
@@ -146,7 +146,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		successCh: make(chan *runnerRef, 1),
 		errCh:     make(chan error, 1),
 	}
-	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM}
+	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return scenario
 }
 
@@ -182,6 +182,12 @@ func TestRequests(t *testing.T) {
 		g.FreeMemory = 12 * format.GigaByte
 		return []gpu.GpuInfo{g}
 	}
+	s.getCpuFn = func() gpu.GpuInfoList {
+		g := gpu.GpuInfo{Library: "cpu"}
+		g.TotalMemory = 32 * format.GigaByte
+		g.FreeMemory = 26 * format.GigaByte
+		return []gpu.GpuInfo{g}
+	}
 	s.newServerFn = scenario1a.newServer
 	slog.Info("scenario1a")
 	s.pendingReqCh <- scenario1a.req
@@ -420,7 +426,7 @@ func TestUseLoadedRunner(t *testing.T) {
 		sessionDuration: 2,
 	}
 	finished := make(chan *LlmRequest)
-	llm1 := &mockLlm{}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	r1 := &runnerRef{llama: llm1, sessionDuration: 1}
 	req.useLoadedRunner(r1, finished)
 	require.Equal(t, uint(1), r1.refCount)
@@ -453,8 +459,8 @@ func TestUpdateFreeSpace(t *testing.T) {
 	gpus[0].FreeMemory = 900
 	gpus[1].TotalMemory = 2000
 	gpus[1].FreeMemory = 1900
-	llm1 := &mockLlm{estimatedVRAM: 100}
-	llm2 := &mockLlm{estimatedVRAM: 200}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
+	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
 	r1 := &runnerRef{llama: llm1, gpus: gpus}
 	r2 := &runnerRef{llama: llm2, gpus: gpus}
 
@@ -465,8 +471,8 @@ func TestUpdateFreeSpace(t *testing.T) {
 	s.loadedMu.Unlock()
 
 	s.updateFreeSpace(gpus)
-	require.Equal(t, uint64(850), gpus[0].FreeMemory)
-	require.Equal(t, uint64(1850), gpus[1].FreeMemory)
+	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
+	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
 }
 
 func TestFindRunnerToUnload(t *testing.T) {
@@ -493,7 +499,7 @@ func TestNeedsReload(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
 
-	llm := &mockLlm{}
+	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	do := api.DefaultOptions()
 	runner := &runnerRef{
 		model: &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}},
@@ -536,8 +542,8 @@ func TestUnloadAllRunners(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
 
-	llm1 := &mockLlm{}
-	llm2 := &mockLlm{}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	s := InitScheduler(ctx)
 	s.unloadAllRunners()
 
@@ -555,7 +561,7 @@ func TestUnloadAllRunners(t *testing.T) {
 }
 
 func TestUnload(t *testing.T) {
-	llm1 := &mockLlm{}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	r1 := &runnerRef{llama: llm1}
 	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}}
 	r1.unload()
@@ -578,6 +584,7 @@ type mockLlm struct {
 	closeCalled        bool
 	estimatedVRAM      uint64
 	estimatedTotal     uint64
+	estimatedVRAMByGPU map[string]uint64
 }
 
 func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
@@ -600,3 +607,4 @@ func (s *mockLlm) Close() error {
 }
 func (s *mockLlm) EstimatedVRAM() uint64  { return s.estimatedVRAM }
 func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
+func (s *mockLlm) EstimagedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }