Refine CPU load behavior with system memory visibility

This commit is contained in:
Daniel Hiltgen 2024-06-03 19:09:23 -07:00
parent 434dfe30c5
commit fc37c192ae
7 changed files with 211 additions and 98 deletions

View file

@ -11,6 +11,8 @@ package gpu
*/ */
import "C" import "C"
import ( import (
"bufio"
"bytes"
"fmt" "fmt"
"log/slog" "log/slog"
"os" "os"
@ -246,6 +248,17 @@ func initOneAPIHandles() *oneapiHandles {
return oHandles return oHandles
} }
func GetCPUInfo() GpuInfoList {
gpuMutex.Lock()
if !bootstrapped {
gpuMutex.Unlock()
GetGPUInfo()
} else {
gpuMutex.Unlock()
}
return GpuInfoList{cpus[0].GpuInfo}
}
func GetGPUInfo() GpuInfoList { func GetGPUInfo() GpuInfoList {
// TODO - consider exploring lspci (and equivalent on windows) to check for // TODO - consider exploring lspci (and equivalent on windows) to check for
// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries // GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
@ -279,22 +292,19 @@ func GetGPUInfo() GpuInfoList {
needRefresh = false needRefresh = false
cpuCapability = getCPUCapability() cpuCapability = getCPUCapability()
var memInfo C.mem_info_t var memInfo C.mem_info_t
C.cpu_check_ram(&memInfo)
if memInfo.err != nil { mem, err := GetCPUMem()
slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err)) if err != nil {
C.free(unsafe.Pointer(memInfo.err)) slog.Warn("error looking up system memory", "error", err)
return []GpuInfo{}
} }
cpuInfo := CPUInfo{ cpus = []CPUInfo{CPUInfo{
GpuInfo: GpuInfo{ GpuInfo: GpuInfo{
memInfo: mem,
Library: "cpu", Library: "cpu",
Variant: cpuCapability.ToVariant(), Variant: cpuCapability.ToVariant(),
ID: "0",
}, },
} }}
cpuInfo.TotalMemory = uint64(memInfo.total)
cpuInfo.FreeMemory = uint64(memInfo.free)
cpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
cpus = []CPUInfo{cpuInfo}
// Fallback to CPU mode if we're lacking required vector extensions on x86 // Fallback to CPU mode if we're lacking required vector extensions on x86
if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" { if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
@ -394,7 +404,25 @@ func GetGPUInfo() GpuInfoList {
// Refresh free memory usage // Refresh free memory usage
if needRefresh { if needRefresh {
// TODO - CPU system memory tracking/refresh mem, err := GetCPUMem()
if err != nil {
slog.Warn("error looking up system memory", "error", err)
} else {
slog.Debug("updating system memory data",
slog.Group(
"before",
"total", format.HumanBytes2(cpus[0].TotalMemory),
"free", format.HumanBytes2(cpus[0].FreeMemory),
),
slog.Group(
"now",
"total", format.HumanBytes2(mem.TotalMemory),
"free", format.HumanBytes2(mem.FreeMemory),
),
)
cpus[0].FreeMemory = mem.FreeMemory
}
var memInfo C.mem_info_t var memInfo C.mem_info_t
if cHandles == nil && len(cudaGPUs) > 0 { if cHandles == nil && len(cudaGPUs) > 0 {
cHandles = initCudaHandles() cHandles = initCudaHandles()
@ -455,7 +483,7 @@ func GetGPUInfo() GpuInfoList {
oneapiGPUs[i].FreeMemory = uint64(memInfo.free) oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
} }
err := RocmGPUInfoList(rocmGPUs).RefreshFreeMemory() err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
if err != nil { if err != nil {
slog.Debug("problem refreshing ROCm free memory", "error", err) slog.Debug("problem refreshing ROCm free memory", "error", err)
} }
@ -478,6 +506,9 @@ func GetGPUInfo() GpuInfoList {
} }
func GetCPUMem() (memInfo, error) { func GetCPUMem() (memInfo, error) {
if runtime.GOOS == "linux" {
return GetLinuxMemInfo()
}
var ret memInfo var ret memInfo
var info C.mem_info_t var info C.mem_info_t
C.cpu_check_ram(&info) C.cpu_check_ram(&info)
@ -651,3 +682,42 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
return "", "" return "", ""
} }
} }
func GetLinuxMemInfo() (memInfo, error) {
var mem memInfo
var total, available, free, buffers, cached uint64
f, err := os.Open("/proc/meminfo")
if err != nil {
return mem, err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
switch {
case bytes.HasPrefix(s.Bytes(), []byte(`MemTotal:`)):
_, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total)
case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)):
_, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
case bytes.HasPrefix(s.Bytes(), []byte(`MemFree:`)):
_, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free)
case bytes.HasPrefix(s.Bytes(), []byte(`Buffers:`)):
_, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers)
case bytes.HasPrefix(s.Bytes(), []byte(`Cached:`)):
_, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached)
default:
continue
}
if err != nil {
return mem, err
}
if total > 0 && available > 0 {
mem.TotalMemory = total * 1024
mem.FreeMemory = available * 1024
return mem, nil
}
}
mem.TotalMemory = total * 1024
mem.FreeMemory = (free + buffers + cached) * 1024
return mem, nil
}

View file

@ -42,6 +42,17 @@ func GetGPUInfo() GpuInfoList {
return []GpuInfo{info} return []GpuInfo{info}
} }
func GetCPUInfo() GpuInfoList {
mem, _ := GetCPUMem()
return []GpuInfo{
{
Library: "cpu",
Variant: GetCPUVariant(),
memInfo: mem,
},
}
}
func GetCPUMem() (memInfo, error) { func GetCPUMem() (memInfo, error) {
return memInfo{ return memInfo{
TotalMemory: uint64(C.getPhysicalMemory()), TotalMemory: uint64(C.getPhysicalMemory()),

View file

@ -35,11 +35,7 @@ void cpu_check_ram(mem_info_t *resp) {
} }
#elif __APPLE__ #elif __APPLE__
// TODO consider an Apple implementation that does something useful // Unused - see gpu_darwin.go
// mem_info_t cpu_check_ram() {
// mem_info_t resp = {0, 0, NULL};
// return resp;
// }
#else #else
#error "Unsupported platform" #error "Unsupported platform"
#endif #endif

View file

@ -11,8 +11,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
char buf[buflen + 1]; char buf[buflen + 1];
int i; int i;
LOG(1, "XXX starting nvml_init %s\n", nvml_lib_path);
struct lookup { struct lookup {
char *s; char *s;
void **p; void **p;
@ -37,13 +35,11 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
} }
// TODO once we've squashed the remaining corner cases remove this log // TODO once we've squashed the remaining corner cases remove this log
// LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path); // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
LOG(1, "XXX wiring functions nvml_init\n");
for (i = 0; l[i].s != NULL; i++) { for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log // TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s); // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) { if (!l[i].p) {
@ -58,7 +54,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
return; return;
} }
} }
LOG(1, "XXX calling init_v2\n");
ret = (*resp->ch.nvmlInit_v2)(); ret = (*resp->ch.nvmlInit_v2)();
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
@ -69,8 +64,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
resp->err = strdup(buf); resp->err = strdup(buf);
return; return;
} }
LOG(1, "XXX nvml_init done\n");
} }
@ -78,7 +71,6 @@ void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *tot
nvmlDevice_t device; nvmlDevice_t device;
nvmlMemory_t memInfo = {0}; nvmlMemory_t memInfo = {0};
nvmlReturn_t ret; nvmlReturn_t ret;
LOG(1, "XXX in nvml_get_free\n");
ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device); ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
if (ret != NVML_SUCCESS) { if (ret != NVML_SUCCESS) {
LOG(1, "unable to get device handle %d: %d", device_id, ret); LOG(1, "unable to get device handle %d: %d", device_id, ret);

View file

@ -37,8 +37,9 @@ type LlamaServer interface {
Tokenize(ctx context.Context, content string) ([]int, error) Tokenize(ctx context.Context, content string) ([]int, error)
Detokenize(ctx context.Context, tokens []int) (string, error) Detokenize(ctx context.Context, tokens []int) (string, error)
Close() error Close() error
EstimatedVRAM() uint64 EstimatedVRAM() uint64 // Total VRAM across all GPUs
EstimatedTotal() uint64 EstimatedTotal() uint64
EstimagedVRAMByGPU(gpuID string) uint64
} }
// llmServer is an instance of the llama.cpp server // llmServer is an instance of the llama.cpp server
@ -49,10 +50,11 @@ type llmServer struct {
status *StatusWriter status *StatusWriter
options api.Options options api.Options
estimate MemoryEstimate estimate MemoryEstimate
totalLayers uint64 totalLayers uint64
gpuCount int // gpuCount int
loadDuration time.Duration // Record how long it took the model to load gpus gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
loadDuration time.Duration // Record how long it took the model to load
loadProgress float32 loadProgress float32
sem *semaphore.Weighted sem *semaphore.Weighted
@ -80,12 +82,13 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
var cpuRunner string var cpuRunner string
var estimate MemoryEstimate var estimate MemoryEstimate
var systemMemory uint64 var systemMemory uint64
gpuCount := len(gpus)
if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
if opts.NumGPU == 0 {
gpus = gpu.GetCPUInfo()
}
if len(gpus) == 1 && gpus[0].Library == "cpu" {
cpuRunner = serverForCpu() cpuRunner = serverForCpu()
gpuCount = 0
estimate = EstimateGPULayers(gpus, ggml, projectors, opts) estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
} else { } else {
if gpus[0].Library == "metal" { if gpus[0].Library == "metal" {
@ -107,7 +110,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
case gpus[0].Library != "metal" && estimate.Layers == 0: case gpus[0].Library != "metal" && estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit // Don't bother loading into the GPU if no layers can fit
cpuRunner = serverForCpu() cpuRunner = serverForCpu()
gpuCount = 0 gpus = gpu.GetCPUInfo()
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu": case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
opts.NumGPU = estimate.Layers opts.NumGPU = estimate.Layers
} }
@ -246,8 +249,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
} }
if strings.HasPrefix(servers[i], "cpu") { if strings.HasPrefix(servers[i], "cpu") {
// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up gpus = gpu.GetCPUInfo()
gpuCount = 0
} }
// Find an availableServers port, retry on each iteration in case the failure was a port conflict race // Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@ -310,7 +312,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
estimate: estimate, estimate: estimate,
sem: semaphore.NewWeighted(int64(numParallel)), sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: ggml.KV().BlockCount() + 1, totalLayers: ggml.KV().BlockCount() + 1,
gpuCount: gpuCount, gpus: gpus,
done: make(chan error, 1), done: make(chan error, 1),
} }
@ -1014,6 +1016,15 @@ func (s *llmServer) EstimatedTotal() uint64 {
return s.estimate.TotalSize return s.estimate.TotalSize
} }
func (s *llmServer) EstimagedVRAMByGPU(gpuID string) uint64 {
for i, gpu := range s.gpus {
if gpu.ID == gpuID {
return s.estimate.GPUSizes[i]
}
}
return 0
}
func parseDurationMs(ms float64) time.Duration { func parseDurationMs(ms float64) time.Duration {
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms)) dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
if err != nil { if err != nil {

View file

@ -7,7 +7,6 @@ import (
"log/slog" "log/slog"
"reflect" "reflect"
"runtime" "runtime"
"slices"
"sort" "sort"
"strings" "strings"
"sync" "sync"
@ -41,6 +40,7 @@ type Scheduler struct {
loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
getGpuFn func() gpu.GpuInfoList getGpuFn func() gpu.GpuInfoList
getCpuFn func() gpu.GpuInfoList
} }
var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded") var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
@ -54,6 +54,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
loaded: make(map[string]*runnerRef), loaded: make(map[string]*runnerRef),
newServerFn: llm.NewLlamaServer, newServerFn: llm.NewLlamaServer,
getGpuFn: gpu.GetGPUInfo, getGpuFn: gpu.GetGPUInfo,
getCpuFn: gpu.GetCPUInfo,
} }
sched.loadFn = sched.load sched.loadFn = sched.load
return sched return sched
@ -131,7 +132,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
} else { } else {
// Either no models are loaded or below envconfig.MaxRunners // Either no models are loaded or below envconfig.MaxRunners
// Get a refreshed GPU list // Get a refreshed GPU list
gpus := s.getGpuFn() var gpus gpu.GpuInfoList
if pending.opts.NumGPU == 0 {
gpus = s.getCpuFn()
} else {
gpus = s.getGpuFn()
}
// Load model for fitting // Load model for fitting
ggml, err := llm.LoadModel(pending.model.ModelPath) ggml, err := llm.LoadModel(pending.model.ModelPath)
@ -140,16 +146,22 @@ func (s *Scheduler) processPending(ctx context.Context) {
break break
} }
// If we're CPU only mode, just limit by envconfig.MaxRunners above // Evaluate if the model will fit in the available system memory, or if we should unload a model first
// TODO handle system memory exhaustion if len(gpus) == 1 && gpus[0].Library == "cpu" {
if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 { if loadedCount == 0 {
slog.Debug("cpu mode with existing models, loading") slog.Debug("cpu mode with first model, loading")
s.loadFn(pending, ggml, gpus) s.loadFn(pending, ggml, gpus)
break break
} }
runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
// No models loaded. Load the model but prefer the best fit. if runnerToExpire == nil {
if loadedCount == 0 { slog.Debug("cpu mode with available system memory or first model, loading")
s.loadFn(pending, ggml, gpus)
break
}
// else we need to expire a runner
} else if loadedCount == 0 {
// No models loaded. Load the model but prefer the best fit.
slog.Debug("loading first model", "model", pending.model.ModelPath) slog.Debug("loading first model", "model", pending.model.ModelPath)
g := pickBestFitGPUs(pending, ggml, gpus) g := pickBestFitGPUs(pending, ggml, gpus)
if g != nil { if g != nil {
@ -159,16 +171,18 @@ func (s *Scheduler) processPending(ctx context.Context) {
break break
} }
// More than one loaded model, so we have to see if the new one fits if runnerToExpire == nil {
// Update free memory from currently loaded models // More than one loaded model, so we have to see if the new one fits
s.updateFreeSpace(gpus) // Update free memory from currently loaded models
gpus = pickBestFitGPUs(pending, ggml, gpus) s.updateFreeSpace(gpus)
if gpus != nil { gpus = pickBestFitGPUs(pending, ggml, gpus)
slog.Debug("new model fits with existing models, loading") if gpus != nil {
s.loadFn(pending, ggml, gpus) slog.Debug("new model fits with existing models, loading")
break s.loadFn(pending, ggml, gpus)
break
}
runnerToExpire = s.findRunnerToUnload()
} }
runnerToExpire = s.findRunnerToUnload()
} }
if runnerToExpire == nil { if runnerToExpire == nil {
@ -368,17 +382,11 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
s.loadedMu.Lock() s.loadedMu.Lock()
for _, r := range s.loaded { for _, r := range s.loaded {
r.refMu.Lock() r.refMu.Lock()
gpuIDs := make([]string, 0, len(r.gpus))
if r.llama != nil { if r.llama != nil {
// TODO this should be broken down by GPU instead of assuming uniform spread
estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus))
for _, gpu := range r.gpus {
gpuIDs = append(gpuIDs, gpu.ID)
}
for _, gpu := range allGpus { for _, gpu := range allGpus {
if slices.Contains(gpuIDs, gpu.ID) { // if slices.Contains(gpuIDs, gpu.ID) {
predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimagedVRAMByGPU(gpu.ID)
} // }
} }
} else { } else {
slog.Warn("unexpected nil runner reference, memory prediction may be incorrect") slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@ -489,7 +497,8 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
// CPU or Metal don't need checking, so no waiting required // CPU or Metal don't need checking, so no waiting required
// windows can page VRAM, only cuda currently can report accurate used vram usage // windows can page VRAM, only cuda currently can report accurate used vram usage
if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || if len(runner.gpus) == 0 ||
(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
(runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") { (runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") {
finished <- struct{}{} finished <- struct{}{}
return finished return finished
@ -624,3 +633,19 @@ func (s *Scheduler) unloadAllRunners() {
} }
} }
} }
// If other runners are loaded, make sure the pending request will fit in system memory
// If not, pick a runner to unload, else return nil and the request can be loaded
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
slog.Debug("evaluating if CPU model load will fit in available system memory")
estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
if estimate.TotalSize <= gpus[0].FreeMemory {
slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
return nil
}
// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
return s.findRunnerToUnload()
}

View file

@ -60,7 +60,7 @@ func TestLoad(t *testing.T) {
err := <-req.errCh err := <-req.errCh
require.Contains(t, err.Error(), "this model may be incompatible") require.Contains(t, err.Error(), "this model may be incompatible")
server := &mockLlm{estimatedVRAM: 10} server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) { s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
return server, nil return server, nil
} }
@ -146,7 +146,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
successCh: make(chan *runnerRef, 1), successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1), errCh: make(chan error, 1),
} }
scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM} scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
return scenario return scenario
} }
@ -182,6 +182,12 @@ func TestRequests(t *testing.T) {
g.FreeMemory = 12 * format.GigaByte g.FreeMemory = 12 * format.GigaByte
return []gpu.GpuInfo{g} return []gpu.GpuInfo{g}
} }
s.getCpuFn = func() gpu.GpuInfoList {
g := gpu.GpuInfo{Library: "cpu"}
g.TotalMemory = 32 * format.GigaByte
g.FreeMemory = 26 * format.GigaByte
return []gpu.GpuInfo{g}
}
s.newServerFn = scenario1a.newServer s.newServerFn = scenario1a.newServer
slog.Info("scenario1a") slog.Info("scenario1a")
s.pendingReqCh <- scenario1a.req s.pendingReqCh <- scenario1a.req
@ -420,7 +426,7 @@ func TestUseLoadedRunner(t *testing.T) {
sessionDuration: 2, sessionDuration: 2,
} }
finished := make(chan *LlmRequest) finished := make(chan *LlmRequest)
llm1 := &mockLlm{} llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
r1 := &runnerRef{llama: llm1, sessionDuration: 1} r1 := &runnerRef{llama: llm1, sessionDuration: 1}
req.useLoadedRunner(r1, finished) req.useLoadedRunner(r1, finished)
require.Equal(t, uint(1), r1.refCount) require.Equal(t, uint(1), r1.refCount)
@ -453,8 +459,8 @@ func TestUpdateFreeSpace(t *testing.T) {
gpus[0].FreeMemory = 900 gpus[0].FreeMemory = 900
gpus[1].TotalMemory = 2000 gpus[1].TotalMemory = 2000
gpus[1].FreeMemory = 1900 gpus[1].FreeMemory = 1900
llm1 := &mockLlm{estimatedVRAM: 100} llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
llm2 := &mockLlm{estimatedVRAM: 200} llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
r1 := &runnerRef{llama: llm1, gpus: gpus} r1 := &runnerRef{llama: llm1, gpus: gpus}
r2 := &runnerRef{llama: llm2, gpus: gpus} r2 := &runnerRef{llama: llm2, gpus: gpus}
@ -465,8 +471,8 @@ func TestUpdateFreeSpace(t *testing.T) {
s.loadedMu.Unlock() s.loadedMu.Unlock()
s.updateFreeSpace(gpus) s.updateFreeSpace(gpus)
require.Equal(t, uint64(850), gpus[0].FreeMemory) require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
require.Equal(t, uint64(1850), gpus[1].FreeMemory) require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
} }
func TestFindRunnerToUnload(t *testing.T) { func TestFindRunnerToUnload(t *testing.T) {
@ -493,7 +499,7 @@ func TestNeedsReload(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done() defer done()
llm := &mockLlm{} llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
do := api.DefaultOptions() do := api.DefaultOptions()
runner := &runnerRef{ runner := &runnerRef{
model: &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}}, model: &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}},
@ -536,8 +542,8 @@ func TestUnloadAllRunners(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done() defer done()
llm1 := &mockLlm{} llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
llm2 := &mockLlm{} llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
s := InitScheduler(ctx) s := InitScheduler(ctx)
s.unloadAllRunners() s.unloadAllRunners()
@ -555,7 +561,7 @@ func TestUnloadAllRunners(t *testing.T) {
} }
func TestUnload(t *testing.T) { func TestUnload(t *testing.T) {
llm1 := &mockLlm{} llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
r1 := &runnerRef{llama: llm1} r1 := &runnerRef{llama: llm1}
r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}} r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}}
r1.unload() r1.unload()
@ -565,19 +571,20 @@ func TestUnload(t *testing.T) {
} }
type mockLlm struct { type mockLlm struct {
pingResp error pingResp error
waitResp error waitResp error
completionResp error completionResp error
embeddingResp []float64 embeddingResp []float64
embeddingRespErr error embeddingRespErr error
tokenizeResp []int tokenizeResp []int
tokenizeRespErr error tokenizeRespErr error
detokenizeResp string detokenizeResp string
detonekizeRespErr error detonekizeRespErr error
closeResp error closeResp error
closeCalled bool closeCalled bool
estimatedVRAM uint64 estimatedVRAM uint64
estimatedTotal uint64 estimatedTotal uint64
estimatedVRAMByGPU map[string]uint64
} }
func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp } func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
@ -598,5 +605,6 @@ func (s *mockLlm) Close() error {
s.closeCalled = true s.closeCalled = true
return s.closeResp return s.closeResp
} }
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM } func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal } func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
func (s *mockLlm) EstimagedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }