Refine CPU load behavior with system memory visibility

This commit is contained in:
Daniel Hiltgen 2024-06-03 19:09:23 -07:00
parent 434dfe30c5
commit fc37c192ae
7 changed files with 211 additions and 98 deletions

View file

@ -11,6 +11,8 @@ package gpu
*/ */
import "C" import "C"
import ( import (
"bufio"
"bytes"
"fmt" "fmt"
"log/slog" "log/slog"
"os" "os"
@ -246,6 +248,17 @@ func initOneAPIHandles() *oneapiHandles {
return oHandles return oHandles
} }
// GetCPUInfo returns a single-element GpuInfoList describing the CPU
// ("cpu" library) device. If GPU discovery has not yet run, it triggers
// a full GetGPUInfo bootstrap first so the cpus slice is populated.
func GetCPUInfo() GpuInfoList {
	// Snapshot the bootstrap flag under the lock, then release it before
	// calling GetGPUInfo, which acquires gpuMutex itself. This replaces
	// the original's duplicated Unlock in both branches.
	gpuMutex.Lock()
	needBootstrap := !bootstrapped
	gpuMutex.Unlock()
	if needBootstrap {
		GetGPUInfo()
	}
	// NOTE(review): cpus is read outside the lock, matching the original
	// behavior — confirm no concurrent writer can race this read.
	return GpuInfoList{cpus[0].GpuInfo}
}
func GetGPUInfo() GpuInfoList { func GetGPUInfo() GpuInfoList {
// TODO - consider exploring lspci (and equivalent on windows) to check for // TODO - consider exploring lspci (and equivalent on windows) to check for
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries // GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
@ -279,22 +292,19 @@ func GetGPUInfo() GpuInfoList {
needRefresh = false needRefresh = false
cpuCapability = getCPUCapability() cpuCapability = getCPUCapability()
var memInfo C.mem_info_t var memInfo C.mem_info_t
C.cpu_check_ram(&memInfo)
if memInfo.err != nil { mem, err := GetCPUMem()
slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err)) if err != nil {
C.free(unsafe.Pointer(memInfo.err)) slog.Warn("error looking up system memory", "error", err)
return []GpuInfo{}
} }
cpuInfo := CPUInfo{ cpus = []CPUInfo{CPUInfo{
GpuInfo: GpuInfo{ GpuInfo: GpuInfo{
memInfo: mem,
Library: "cpu", Library: "cpu",
Variant: cpuCapability.ToVariant(), Variant: cpuCapability.ToVariant(),
ID: "0",
}, },
} }}
cpuInfo.TotalMemory = uint64(memInfo.total)
cpuInfo.FreeMemory = uint64(memInfo.free)
cpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
cpus = []CPUInfo{cpuInfo}
// Fallback to CPU mode if we're lacking required vector extensions on x86 // Fallback to CPU mode if we're lacking required vector extensions on x86
if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" { if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
@ -394,7 +404,25 @@ func GetGPUInfo() GpuInfoList {
// Refresh free memory usage // Refresh free memory usage
if needRefresh { if needRefresh {
// TODO - CPU system memory tracking/refresh mem, err := GetCPUMem()
if err != nil {
slog.Warn("error looking up system memory", "error", err)
} else {
slog.Debug("updating system memory data",
slog.Group(
"before",
"total", format.HumanBytes2(cpus[0].TotalMemory),
"free", format.HumanBytes2(cpus[0].FreeMemory),
),
slog.Group(
"now",
"total", format.HumanBytes2(mem.TotalMemory),
"free", format.HumanBytes2(mem.FreeMemory),
),
)
cpus[0].FreeMemory = mem.FreeMemory
}
var memInfo C.mem_info_t var memInfo C.mem_info_t
if cHandles == nil && len(cudaGPUs) > 0 { if cHandles == nil && len(cudaGPUs) > 0 {
cHandles = initCudaHandles() cHandles = initCudaHandles()
@ -455,7 +483,7 @@ func GetGPUInfo() GpuInfoList {
oneapiGPUs[i].FreeMemory = uint64(memInfo.free) oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
} }
err := RocmGPUInfoList(rocmGPUs).RefreshFreeMemory() err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
if err != nil { if err != nil {
slog.Debug("problem refreshing ROCm free memory", "error", err) slog.Debug("problem refreshing ROCm free memory", "error", err)
} }
@ -478,6 +506,9 @@ func GetGPUInfo() GpuInfoList {
} }
func GetCPUMem() (memInfo, error) { func GetCPUMem() (memInfo, error) {
if runtime.GOOS == "linux" {
return GetLinuxMemInfo()
}
var ret memInfo var ret memInfo
var info C.mem_info_t var info C.mem_info_t
C.cpu_check_ram(&info) C.cpu_check_ram(&info)
@ -651,3 +682,42 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
return "", "" return "", ""
} }
} }
// GetLinuxMemInfo reads /proc/meminfo and reports total and free system
// memory in bytes. Free memory prefers the kernel's MemAvailable figure;
// on older kernels that lack MemAvailable it falls back to
// MemFree + Buffers + Cached.
func GetLinuxMemInfo() (memInfo, error) {
	var mem memInfo
	var total, available, free, buffers, cached uint64
	f, err := os.Open("/proc/meminfo")
	if err != nil {
		return mem, err
	}
	defer f.Close()
	s := bufio.NewScanner(f)
	for s.Scan() {
		switch {
		case bytes.HasPrefix(s.Bytes(), []byte(`MemTotal:`)):
			_, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total)
		case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)):
			_, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
		case bytes.HasPrefix(s.Bytes(), []byte(`MemFree:`)):
			_, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free)
		case bytes.HasPrefix(s.Bytes(), []byte(`Buffers:`)):
			_, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers)
		case bytes.HasPrefix(s.Bytes(), []byte(`Cached:`)):
			_, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached)
		default:
			continue
		}
		if err != nil {
			return mem, err
		}
		// MemTotal and MemAvailable are the first two fields of
		// /proc/meminfo; once both are seen we can stop scanning early.
		if total > 0 && available > 0 {
			mem.TotalMemory = total * 1024 // /proc/meminfo values are in KiB
			mem.FreeMemory = available * 1024
			return mem, nil
		}
	}
	// BUGFIX: the original never checked the scanner's error, so a read
	// failure silently produced partial (possibly zero) memory figures.
	if err := s.Err(); err != nil {
		return mem, err
	}
	mem.TotalMemory = total * 1024
	mem.FreeMemory = (free + buffers + cached) * 1024
	return mem, nil
}

View file

@ -42,6 +42,17 @@ func GetGPUInfo() GpuInfoList {
return []GpuInfo{info} return []GpuInfo{info}
} }
// GetCPUInfo returns a single "cpu" GpuInfo entry populated with the
// system memory figures reported by GetCPUMem.
func GetCPUInfo() GpuInfoList {
	// Error deliberately discarded: on this platform GetCPUMem wraps
	// C.getPhysicalMemory — presumably infallible, TODO(review) confirm;
	// a zero-valued memInfo is the fallback either way.
	mem, _ := GetCPUMem()
	return []GpuInfo{
		{
			Library: "cpu",
			Variant: GetCPUVariant(),
			memInfo: mem,
		},
	}
}
func GetCPUMem() (memInfo, error) { func GetCPUMem() (memInfo, error) {
return memInfo{ return memInfo{
TotalMemory: uint64(C.getPhysicalMemory()), TotalMemory: uint64(C.getPhysicalMemory()),

View file

@ -35,11 +35,7 @@ void cpu_check_ram(mem_info_t *resp) {
} }
#elif __APPLE__ #elif __APPLE__
// TODO consider an Apple implementation that does something useful // Unused - see gpu_darwin.go
// mem_info_t cpu_check_ram() {
// mem_info_t resp = {0, 0, NULL};
// return resp;
// }
#else #else
#error "Unsupported platform" #error "Unsupported platform"
#endif #endif

View file

@ -11,8 +11,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
char buf[buflen + 1]; char buf[buflen + 1];
int i; int i;
LOG(1, "XXX starting nvml_init %s\n", nvml_lib_path);
struct lookup { struct lookup {
char *s; char *s;
void **p; void **p;
@ -37,13 +35,11 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
} }
// TODO once we've squashed the remaining corner cases remove this log // TODO once we've squashed the remaining corner cases remove this log
// LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path); // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
LOG(1, "XXX wiring functions nvml_init\n");
for (i = 0; l[i].s != NULL; i++) { for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log // TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s); // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) { if (!l[i].p) {
@ -58,7 +54,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
return; return;
} }
} }
LOG(1, "XXX calling init_v2\n");
ret = (*resp->ch.nvmlInit_v2)(); ret = (*resp->ch.nvmlInit_v2)();
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
@ -69,8 +64,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
LOG(1, "XXX nvml_init done\n");
} }
@ -78,7 +71,6 @@ void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *tot
nvmlDevice_t device; nvmlDevice_t device;
nvmlMemory_t memInfo = {0}; nvmlMemory_t memInfo = {0};
nvmlReturn_t ret; nvmlReturn_t ret;
LOG(1, "XXX in nvml_get_free\n");
ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device); ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
LOG(1, "unable to get device handle %d: %d", device_id, ret); LOG(1, "unable to get device handle %d: %d", device_id, ret);

View file

@ -37,8 +37,9 @@ type LlamaServer interface {
Tokenize(ctx context.Context, content string) ([]int, error) Tokenize(ctx context.Context, content string) ([]int, error)
Detokenize(ctx context.Context, tokens []int) (string, error) Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error Close() error
EstimatedVRAM() uint64 EstimatedVRAM() uint64 // Total VRAM across all GPUs
EstimatedTotal() uint64 EstimatedTotal() uint64
EstimagedVRAMByGPU(gpuID string) uint64
} }
// llmServer is an instance of the llama.cpp server // llmServer is an instance of the llama.cpp server
@ -51,7 +52,8 @@ type llmServer struct {
estimate MemoryEstimate estimate MemoryEstimate
totalLayers uint64 totalLayers uint64
gpuCount int // gpuCount int
gpus gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
loadDuration time.Duration // Record how long it took the model to load loadDuration time.Duration // Record how long it took the model to load
loadProgress float32 loadProgress float32
@ -80,12 +82,13 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
var cpuRunner string var cpuRunner string
var estimate MemoryEstimate var estimate MemoryEstimate
var systemMemory uint64 var systemMemory uint64
gpuCount := len(gpus)
if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
if opts.NumGPU == 0 {
gpus = gpu.GetCPUInfo()
}
if len(gpus) == 1 && gpus[0].Library == "cpu" {
cpuRunner = serverForCpu() cpuRunner = serverForCpu()
gpuCount = 0
estimate = EstimateGPULayers(gpus, ggml, projectors, opts) estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
} else { } else {
if gpus[0].Library == "metal" { if gpus[0].Library == "metal" {
@ -107,7 +110,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
case gpus[0].Library != "metal" && estimate.Layers == 0: case gpus[0].Library != "metal" && estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit // Don't bother loading into the GPU if no layers can fit
cpuRunner = serverForCpu() cpuRunner = serverForCpu()
gpuCount = 0 gpus = gpu.GetCPUInfo()
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu": case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
opts.NumGPU = estimate.Layers opts.NumGPU = estimate.Layers
} }
@ -246,8 +249,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
} }
if strings.HasPrefix(servers[i], "cpu") { if strings.HasPrefix(servers[i], "cpu") {
// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up gpus = gpu.GetCPUInfo()
gpuCount = 0
} }
// Find an availableServers port, retry on each iteration in case the failure was a port conflict race // Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@ -310,7 +312,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
estimate: estimate, estimate: estimate,
sem: semaphore.NewWeighted(int64(numParallel)), sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: ggml.KV().BlockCount() + 1, totalLayers: ggml.KV().BlockCount() + 1,
gpuCount: gpuCount, gpus: gpus,
done: make(chan error, 1), done: make(chan error, 1),
} }
@ -1014,6 +1016,15 @@ func (s *llmServer) EstimatedTotal() uint64 {
return s.estimate.TotalSize return s.estimate.TotalSize
} }
// EstimagedVRAMByGPU reports this runner's estimated VRAM usage on the
// GPU with the given ID, or 0 when the runner is not using that GPU.
// The per-GPU sizes were estimated against s.gpus just before load, so
// indices into s.estimate.GPUSizes line up with s.gpus.
// NOTE(review): the "Estimaged" typo is part of the LlamaServer
// interface and is kept to stay compatible with callers.
func (s *llmServer) EstimagedVRAMByGPU(gpuID string) uint64 {
	for i, g := range s.gpus {
		if g.ID != gpuID {
			continue
		}
		return s.estimate.GPUSizes[i]
	}
	return 0
}
func parseDurationMs(ms float64) time.Duration { func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms)) dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil { if err != nil {

View file

@ -7,7 +7,6 @@ import (
"log/slog" "log/slog"
"reflect" "reflect"
"runtime" "runtime"
"slices"
"sort" "sort"
"strings" "strings"
"sync" "sync"
@ -41,6 +40,7 @@ type Scheduler struct {
loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
getGpuFn func() gpu.GpuInfoList getGpuFn func() gpu.GpuInfoList
getCpuFn func() gpu.GpuInfoList
} }
var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded") var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
@ -54,6 +54,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: llm.NewLlamaServer, newServerFn: llm.NewLlamaServer,
getGpuFn: gpu.GetGPUInfo, getGpuFn: gpu.GetGPUInfo,
getCpuFn: gpu.GetCPUInfo,
} }
sched.loadFn = sched.load sched.loadFn = sched.load
return sched return sched
@ -131,7 +132,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
} else { } else {
// Either no models are loaded or below envconfig.MaxRunners // Either no models are loaded or below envconfig.MaxRunners
// Get a refreshed GPU list // Get a refreshed GPU list
gpus := s.getGpuFn() var gpus gpu.GpuInfoList
if pending.opts.NumGPU == 0 {
gpus = s.getCpuFn()
} else {
gpus = s.getGpuFn()
}
// Load model for fitting // Load model for fitting
ggml, err := llm.LoadModel(pending.model.ModelPath) ggml, err := llm.LoadModel(pending.model.ModelPath)
@ -140,16 +146,22 @@ func (s *Scheduler) processPending(ctx context.Context) {
break break
} }
// If we're CPU only mode, just limit by envconfig.MaxRunners above // Evaluate if the model will fit in the available system memory, or if we should unload a model first
// TODO handle system memory exhaustion if len(gpus) == 1 && gpus[0].Library == "cpu" {
if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 { if loadedCount == 0 {
slog.Debug("cpu mode with existing models, loading") slog.Debug("cpu mode with first model, loading")
s.loadFn(pending, ggml, gpus) s.loadFn(pending, ggml, gpus)
break break
} }
runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
if runnerToExpire == nil {
slog.Debug("cpu mode with available system memory or first model, loading")
s.loadFn(pending, ggml, gpus)
break
}
// else we need to expire a runner
} else if loadedCount == 0 {
// No models loaded. Load the model but prefer the best fit. // No models loaded. Load the model but prefer the best fit.
if loadedCount == 0 {
slog.Debug("loading first model", "model", pending.model.ModelPath) slog.Debug("loading first model", "model", pending.model.ModelPath)
g := pickBestFitGPUs(pending, ggml, gpus) g := pickBestFitGPUs(pending, ggml, gpus)
if g != nil { if g != nil {
@ -159,6 +171,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
break break
} }
if runnerToExpire == nil {
// More than one loaded model, so we have to see if the new one fits // More than one loaded model, so we have to see if the new one fits
// Update free memory from currently loaded models // Update free memory from currently loaded models
s.updateFreeSpace(gpus) s.updateFreeSpace(gpus)
@ -170,6 +183,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
} }
runnerToExpire = s.findRunnerToUnload() runnerToExpire = s.findRunnerToUnload()
} }
}
if runnerToExpire == nil { if runnerToExpire == nil {
// Shouildn't happen // Shouildn't happen
@ -368,17 +382,11 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
s.loadedMu.Lock() s.loadedMu.Lock()
for _, r := range s.loaded { for _, r := range s.loaded {
r.refMu.Lock() r.refMu.Lock()
gpuIDs := make([]string, 0, len(r.gpus))
if r.llama != nil { if r.llama != nil {
// TODO this should be broken down by GPU instead of assuming uniform spread
estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus))
for _, gpu := range r.gpus {
gpuIDs = append(gpuIDs, gpu.ID)
}
for _, gpu := range allGpus { for _, gpu := range allGpus {
if slices.Contains(gpuIDs, gpu.ID) { // if slices.Contains(gpuIDs, gpu.ID) {
predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimagedVRAMByGPU(gpu.ID)
} // }
} }
} else { } else {
slog.Warn("unexpected nil runner reference, memory prediction may be incorrect") slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@ -489,7 +497,8 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
// CPU or Metal don't need checking, so no waiting required // CPU or Metal don't need checking, so no waiting required
// windows can page VRAM, only cuda currently can report accurate used vram usage // windows can page VRAM, only cuda currently can report accurate used vram usage
if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || if len(runner.gpus) == 0 ||
(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
(runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") { (runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") {
finished <- struct{}{} finished <- struct{}{}
return finished return finished
@ -624,3 +633,19 @@ func (s *Scheduler) unloadAllRunners() {
} }
} }
} }
// maybeFindCPURunnerToUnload checks whether the pending CPU-mode request
// fits in available system memory. It returns nil when the load can
// proceed, otherwise it picks a loaded runner to expire first.
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
	slog.Debug("evaluating if CPU model load will fit in available system memory")
	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
	if estimate.TotalSize > gpus[0].FreeMemory {
		// Doesn't fit — evict something to make room.
		// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
		return s.findRunnerToUnload()
	}
	slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
	return nil
}

View file

@ -60,7 +60,7 @@ func TestLoad(t *testing.T) {
err := <-req.errCh err := <-req.errCh
require.Contains(t, err.Error(), "this model may be incompatible") require.Contains(t, err.Error(), "this model may be incompatible")
server := &mockLlm{estimatedVRAM: 10} server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) { s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
return server, nil return server, nil
} }
@ -146,7 +146,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
successCh: make(chan *runnerRef, 1), successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1), errCh: make(chan error, 1),
} }
scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM} scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
return scenario return scenario
} }
@ -182,6 +182,12 @@ func TestRequests(t *testing.T) {
g.FreeMemory = 12 * format.GigaByte g.FreeMemory = 12 * format.GigaByte
return []gpu.GpuInfo{g} return []gpu.GpuInfo{g}
} }
s.getCpuFn = func() gpu.GpuInfoList {
g := gpu.GpuInfo{Library: "cpu"}
g.TotalMemory = 32 * format.GigaByte
g.FreeMemory = 26 * format.GigaByte
return []gpu.GpuInfo{g}
}
s.newServerFn = scenario1a.newServer s.newServerFn = scenario1a.newServer
slog.Info("scenario1a") slog.Info("scenario1a")
s.pendingReqCh <- scenario1a.req s.pendingReqCh <- scenario1a.req
@ -420,7 +426,7 @@ func TestUseLoadedRunner(t *testing.T) {
sessionDuration: 2, sessionDuration: 2,
} }
finished := make(chan *LlmRequest) finished := make(chan *LlmRequest)
llm1 := &mockLlm{} llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
r1 := &runnerRef{llama: llm1, sessionDuration: 1} r1 := &runnerRef{llama: llm1, sessionDuration: 1}
req.useLoadedRunner(r1, finished) req.useLoadedRunner(r1, finished)
require.Equal(t, uint(1), r1.refCount) require.Equal(t, uint(1), r1.refCount)
@ -453,8 +459,8 @@ func TestUpdateFreeSpace(t *testing.T) {
gpus[0].FreeMemory = 900 gpus[0].FreeMemory = 900
gpus[1].TotalMemory = 2000 gpus[1].TotalMemory = 2000
gpus[1].FreeMemory = 1900 gpus[1].FreeMemory = 1900
llm1 := &mockLlm{estimatedVRAM: 100} llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
llm2 := &mockLlm{estimatedVRAM: 200} llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
r1 := &runnerRef{llama: llm1, gpus: gpus} r1 := &runnerRef{llama: llm1, gpus: gpus}
r2 := &runnerRef{llama: llm2, gpus: gpus} r2 := &runnerRef{llama: llm2, gpus: gpus}
@ -465,8 +471,8 @@ func TestUpdateFreeSpace(t *testing.T) {
s.loadedMu.Unlock() s.loadedMu.Unlock()
s.updateFreeSpace(gpus) s.updateFreeSpace(gpus)
require.Equal(t, uint64(850), gpus[0].FreeMemory) require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
require.Equal(t, uint64(1850), gpus[1].FreeMemory) require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
} }
func TestFindRunnerToUnload(t *testing.T) { func TestFindRunnerToUnload(t *testing.T) {
@ -493,7 +499,7 @@ func TestNeedsReload(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done() defer done()
llm := &mockLlm{} llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
do := api.DefaultOptions() do := api.DefaultOptions()
runner := &runnerRef{ runner := &runnerRef{
model: &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}}, model: &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}},
@ -536,8 +542,8 @@ func TestUnloadAllRunners(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done() defer done()
llm1 := &mockLlm{} llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
llm2 := &mockLlm{} llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
s := InitScheduler(ctx) s := InitScheduler(ctx)
s.unloadAllRunners() s.unloadAllRunners()
@ -555,7 +561,7 @@ func TestUnloadAllRunners(t *testing.T) {
} }
func TestUnload(t *testing.T) { func TestUnload(t *testing.T) {
llm1 := &mockLlm{} llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
r1 := &runnerRef{llama: llm1} r1 := &runnerRef{llama: llm1}
r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}} r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}}
r1.unload() r1.unload()
@ -578,6 +584,7 @@ type mockLlm struct {
closeCalled bool closeCalled bool
estimatedVRAM uint64 estimatedVRAM uint64
estimatedTotal uint64 estimatedTotal uint64
estimatedVRAMByGPU map[string]uint64
} }
func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp } func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
@ -600,3 +607,4 @@ func (s *mockLlm) Close() error {
} }
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM } func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal } func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
func (s *mockLlm) EstimagedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }