Refine CPU load behavior with system memory visibility

parent 434dfe30c5
commit fc37c192ae
gpu/gpu.go (96 changes)

@@ -11,6 +11,8 @@ package gpu
 */
 import "C"
 import (
+	"bufio"
+	"bytes"
 	"fmt"
 	"log/slog"
 	"os"
@@ -246,6 +248,17 @@ func initOneAPIHandles() *oneapiHandles {
 	return oHandles
 }
 
+func GetCPUInfo() GpuInfoList {
+	gpuMutex.Lock()
+	if !bootstrapped {
+		gpuMutex.Unlock()
+		GetGPUInfo()
+	} else {
+		gpuMutex.Unlock()
+	}
+	return GpuInfoList{cpus[0].GpuInfo}
+}
+
 func GetGPUInfo() GpuInfoList {
 	// TODO - consider exploring lspci (and equivalent on windows) to check for
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
@@ -279,22 +292,19 @@ func GetGPUInfo() GpuInfoList {
 		needRefresh = false
 		cpuCapability = getCPUCapability()
 		var memInfo C.mem_info_t
-		C.cpu_check_ram(&memInfo)
-		if memInfo.err != nil {
-			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
-			C.free(unsafe.Pointer(memInfo.err))
-			return []GpuInfo{}
+
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
 		}
-		cpuInfo := CPUInfo{
+		cpus = []CPUInfo{CPUInfo{
 			GpuInfo: GpuInfo{
+				memInfo: mem,
 				Library: "cpu",
 				Variant: cpuCapability.ToVariant(),
+				ID:      "0",
 			},
-		}
-		cpuInfo.TotalMemory = uint64(memInfo.total)
-		cpuInfo.FreeMemory = uint64(memInfo.free)
-		cpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-		cpus = []CPUInfo{cpuInfo}
+		}}
 
 		// Fallback to CPU mode if we're lacking required vector extensions on x86
 		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
@@ -394,7 +404,25 @@ func GetGPUInfo() GpuInfoList {
 
 	// Refresh free memory usage
 	if needRefresh {
-		// TODO - CPU system memory tracking/refresh
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
+		} else {
+			slog.Debug("updating system memory data",
+				slog.Group(
+					"before",
+					"total", format.HumanBytes2(cpus[0].TotalMemory),
+					"free", format.HumanBytes2(cpus[0].FreeMemory),
+				),
+				slog.Group(
+					"now",
+					"total", format.HumanBytes2(mem.TotalMemory),
+					"free", format.HumanBytes2(mem.FreeMemory),
+				),
+			)
+			cpus[0].FreeMemory = mem.FreeMemory
+		}
+
 		var memInfo C.mem_info_t
 		if cHandles == nil && len(cudaGPUs) > 0 {
 			cHandles = initCudaHandles()
@@ -455,7 +483,7 @@ func GetGPUInfo() GpuInfoList {
 			oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
 		}
 
-		err := RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
+		err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
 		if err != nil {
 			slog.Debug("problem refreshing ROCm free memory", "error", err)
 		}
@@ -478,6 +506,9 @@ func GetGPUInfo() GpuInfoList {
 }
 
 func GetCPUMem() (memInfo, error) {
+	if runtime.GOOS == "linux" {
+		return GetLinuxMemInfo()
+	}
 	var ret memInfo
 	var info C.mem_info_t
 	C.cpu_check_ram(&info)
@@ -651,3 +682,42 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
+
+func GetLinuxMemInfo() (memInfo, error) {
+	var mem memInfo
+	var total, available, free, buffers, cached uint64
+	f, err := os.Open("/proc/meminfo")
+	if err != nil {
+		return mem, err
+	}
+	defer f.Close()
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		switch {
+		case bytes.HasPrefix(s.Bytes(), []byte(`MemTotal:`)):
+			_, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total)
+		case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)):
+			_, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
+		case bytes.HasPrefix(s.Bytes(), []byte(`MemFree:`)):
+			_, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free)
+		case bytes.HasPrefix(s.Bytes(), []byte(`Buffers:`)):
+			_, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers)
+		case bytes.HasPrefix(s.Bytes(), []byte(`Cached:`)):
+			_, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached)
+		default:
+			continue
+		}
+		if err != nil {
+			return mem, err
+		}
+
+		if total > 0 && available > 0 {
+			mem.TotalMemory = total * 1024
+			mem.FreeMemory = available * 1024
+			return mem, nil
+		}
+	}
+	mem.TotalMemory = total * 1024
+	mem.FreeMemory = (free + buffers + cached) * 1024
+	return mem, nil
+}
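For reference, a minimal standalone sketch (not part of this commit) of the same /proc/meminfo accounting the new GetLinuxMemInfo uses: prefer MemAvailable when the kernel reports it, otherwise approximate free memory as MemFree + Buffers + Cached. Values in /proc/meminfo are in kB, hence the *1024. Names below are illustrative only.

package main

import (
	"bufio"
	"fmt"
	"os"
)

func main() {
	f, err := os.Open("/proc/meminfo")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer f.Close()

	// Each line looks like "MemTotal:       32693536 kB"; collect name -> kB.
	fields := map[string]uint64{}
	s := bufio.NewScanner(f)
	for s.Scan() {
		var name string
		var kb uint64
		if _, err := fmt.Sscanf(s.Text(), "%s %d", &name, &kb); err == nil {
			fields[name] = kb
		}
	}

	total := fields["MemTotal:"] * 1024
	// Prefer MemAvailable; fall back to MemFree + Buffers + Cached on older kernels.
	free := fields["MemAvailable:"] * 1024
	if free == 0 {
		free = (fields["MemFree:"] + fields["Buffers:"] + fields["Cached:"]) * 1024
	}
	fmt.Printf("system memory: total=%d bytes free=%d bytes\n", total, free)
}

MemAvailable is usually the better signal because the kernel already discounts page cache it cannot actually reclaim.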
gpu/gpu_darwin.go

@@ -42,6 +42,17 @@ func GetGPUInfo() GpuInfoList {
 	return []GpuInfo{info}
 }
 
+func GetCPUInfo() GpuInfoList {
+	mem, _ := GetCPUMem()
+	return []GpuInfo{
+		{
+			Library: "cpu",
+			Variant: GetCPUVariant(),
+			memInfo: mem,
+		},
+	}
+}
+
 func GetCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
gpu/gpu_info_cpu.c

@@ -35,11 +35,7 @@ void cpu_check_ram(mem_info_t *resp) {
 }
 
 #elif __APPLE__
-// TODO consider an Apple implementation that does something useful
-// mem_info_t cpu_check_ram() {
-//   mem_info_t resp = {0, 0, NULL};
-//   return resp;
-// }
+// Unused - see gpu_darwin.go
 #else
 #error "Unsupported platform"
 #endif
gpu/gpu_info_nvml.c

@@ -11,8 +11,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   char buf[buflen + 1];
   int i;
 
-  LOG(1, "XXX starting nvml_init %s\n", nvml_lib_path);
-
   struct lookup {
     char *s;
     void **p;
@@ -37,13 +35,11 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   }
 
   // TODO once we've squashed the remaining corner cases remove this log
   // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
 
-  LOG(1, "XXX wiring functions nvml_init\n");
-
   for (i = 0; l[i].s != NULL; i++) {
     // TODO once we've squashed the remaining corner cases remove this log
-    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+    // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
 
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
     if (!l[i].p) {
@@ -58,7 +54,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
       return;
     }
   }
-  LOG(1, "XXX calling init_v2\n");
 
   ret = (*resp->ch.nvmlInit_v2)();
   if (ret != NVML_SUCCESS) {
@@ -69,8 +64,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
     resp->err = strdup(buf);
     return;
   }
-  LOG(1, "XXX nvml_init done\n");
-
 }
 
 
@@ -78,7 +71,6 @@ void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *tot
   nvmlDevice_t device;
   nvmlMemory_t memInfo = {0};
   nvmlReturn_t ret;
-  LOG(1, "XXX in nvml_get_free\n");
   ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
   if (ret != NVML_SUCCESS) {
     LOG(1, "unable to get device handle %d: %d", device_id, ret);
llm/server.go

@@ -37,8 +37,9 @@ type LlamaServer interface {
 	Tokenize(ctx context.Context, content string) ([]int, error)
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
-	EstimatedVRAM() uint64
+	EstimatedVRAM() uint64 // Total VRAM across all GPUs
 	EstimatedTotal() uint64
+	EstimagedVRAMByGPU(gpuID string) uint64
 }
 
 // llmServer is an instance of the llama.cpp server
@@ -51,7 +52,8 @@ type llmServer struct {
 
 	estimate    MemoryEstimate
 	totalLayers uint64
-	gpuCount    int
+	// gpuCount    int
+	gpus         gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
 	loadDuration time.Duration // Record how long it took the model to load
 	loadProgress float32
 
@@ -80,12 +82,13 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var cpuRunner string
 	var estimate MemoryEstimate
 	var systemMemory uint64
-	gpuCount := len(gpus)
-	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
-		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
 
+	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
+	if opts.NumGPU == 0 {
+		gpus = gpu.GetCPUInfo()
+	}
+	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 		cpuRunner = serverForCpu()
-		gpuCount = 0
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		if gpus[0].Library == "metal" {
@@ -107,7 +110,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	case gpus[0].Library != "metal" && estimate.Layers == 0:
 		// Don't bother loading into the GPU if no layers can fit
 		cpuRunner = serverForCpu()
-		gpuCount = 0
+		gpus = gpu.GetCPUInfo()
 	case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 		opts.NumGPU = estimate.Layers
 	}
@@ -246,8 +249,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		if strings.HasPrefix(servers[i], "cpu") {
-			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
-			gpuCount = 0
+			gpus = gpu.GetCPUInfo()
 		}
 
 		// Find an availableServers port, retry on each iteration in case the failure was a port conflict race
@@ -310,7 +312,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		estimate:    estimate,
 		sem:         semaphore.NewWeighted(int64(numParallel)),
 		totalLayers: ggml.KV().BlockCount() + 1,
-		gpuCount:    gpuCount,
+		gpus:        gpus,
 		done:        make(chan error, 1),
 	}
 
@@ -1014,6 +1016,15 @@ func (s *llmServer) EstimatedTotal() uint64 {
 	return s.estimate.TotalSize
 }
 
+func (s *llmServer) EstimagedVRAMByGPU(gpuID string) uint64 {
+	for i, gpu := range s.gpus {
+		if gpu.ID == gpuID {
+			return s.estimate.GPUSizes[i]
+		}
+	}
+	return 0
+}
+
 func parseDurationMs(ms float64) time.Duration {
 	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
 	if err != nil {
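Not part of the commit: a self-contained sketch of the bookkeeping change above. Instead of dividing EstimatedVRAM() evenly across a runner's GPUs, the scheduler can now subtract a per-device estimate (EstimagedVRAMByGPU in the diff) from each GPU's free memory. Types and numbers below are stand-ins; the values mirror the updated TestUpdateFreeSpace.

package main

import "fmt"

// runner stands in for a loaded llmServer; vramByGPU mirrors what the
// per-GPU estimate method returns for each device ID.
type runner struct {
	vramByGPU map[string]uint64
}

func (r runner) estimatedVRAMByGPU(id string) uint64 { return r.vramByGPU[id] }

func main() {
	// Free memory reported per GPU before accounting for loaded runners.
	free := map[string]uint64{"1": 1000, "2": 2000}

	// Two loaded runners with per-GPU estimates.
	runners := []runner{
		{vramByGPU: map[string]uint64{"1": 50, "2": 50}},
		{vramByGPU: map[string]uint64{"1": 125, "2": 75}},
	}

	for id, f := range free {
		var used uint64
		for _, r := range runners {
			used += r.estimatedVRAMByGPU(id)
		}
		fmt.Printf("GPU %s: %d free after subtracting loaded runners\n", id, f-used)
	}
}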
server/sched.go

@@ -7,7 +7,6 @@ import (
 	"log/slog"
 	"reflect"
 	"runtime"
-	"slices"
 	"sort"
 	"strings"
 	"sync"
@@ -41,6 +40,7 @@ type Scheduler struct {
 	loadFn      func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
 	newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
 	getGpuFn    func() gpu.GpuInfoList
+	getCpuFn    func() gpu.GpuInfoList
 }
 
 var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
@@ -54,6 +54,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		loaded:      make(map[string]*runnerRef),
 		newServerFn: llm.NewLlamaServer,
 		getGpuFn:    gpu.GetGPUInfo,
+		getCpuFn:    gpu.GetCPUInfo,
 	}
 	sched.loadFn = sched.load
 	return sched
@@ -131,7 +132,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			} else {
 				// Either no models are loaded or below envconfig.MaxRunners
 				// Get a refreshed GPU list
-				gpus := s.getGpuFn()
+				var gpus gpu.GpuInfoList
+				if pending.opts.NumGPU == 0 {
+					gpus = s.getCpuFn()
+				} else {
+					gpus = s.getGpuFn()
+				}
 
 				// Load model for fitting
 				ggml, err := llm.LoadModel(pending.model.ModelPath)
@@ -140,16 +146,22 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					break
 				}
 
-				// If we're CPU only mode, just limit by envconfig.MaxRunners above
-				// TODO handle system memory exhaustion
-				if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
-					slog.Debug("cpu mode with existing models, loading")
-					s.loadFn(pending, ggml, gpus)
-					break
-				}
-
-				// No models loaded. Load the model but prefer the best fit.
-				if loadedCount == 0 {
+				// Evaluate if the model will fit in the available system memory, or if we should unload a model first
+				if len(gpus) == 1 && gpus[0].Library == "cpu" {
+					if loadedCount == 0 {
+						slog.Debug("cpu mode with first model, loading")
+						s.loadFn(pending, ggml, gpus)
+						break
+					}
+					runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
+					if runnerToExpire == nil {
+						slog.Debug("cpu mode with available system memory or first model, loading")
+						s.loadFn(pending, ggml, gpus)
+						break
+					}
+					// else we need to expire a runner
+				} else if loadedCount == 0 {
+					// No models loaded. Load the model but prefer the best fit.
 					slog.Debug("loading first model", "model", pending.model.ModelPath)
 					g := pickBestFitGPUs(pending, ggml, gpus)
 					if g != nil {
@@ -159,6 +171,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}
 
+				if runnerToExpire == nil {
 					// More than one loaded model, so we have to see if the new one fits
 					// Update free memory from currently loaded models
 					s.updateFreeSpace(gpus)
@@ -170,6 +183,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					}
 					runnerToExpire = s.findRunnerToUnload()
 				}
+			}
 
 			if runnerToExpire == nil {
 				// Shouildn't happen
@@ -368,17 +382,11 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 	s.loadedMu.Lock()
 	for _, r := range s.loaded {
 		r.refMu.Lock()
-		gpuIDs := make([]string, 0, len(r.gpus))
 		if r.llama != nil {
-			// TODO this should be broken down by GPU instead of assuming uniform spread
-			estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus))
-			for _, gpu := range r.gpus {
-				gpuIDs = append(gpuIDs, gpu.ID)
-			}
 			for _, gpu := range allGpus {
-				if slices.Contains(gpuIDs, gpu.ID) {
-					predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU
-				}
+				// if slices.Contains(gpuIDs, gpu.ID) {
+				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimagedVRAMByGPU(gpu.ID)
+				// }
 			}
 		} else {
 			slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@@ -489,7 +497,8 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 
 	// CPU or Metal don't need checking, so no waiting required
 	// windows can page VRAM, only cuda currently can report accurate used vram usage
-	if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
+	if len(runner.gpus) == 0 ||
+		(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
 		(runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") {
 		finished <- struct{}{}
 		return finished
@@ -624,3 +633,19 @@ func (s *Scheduler) unloadAllRunners() {
 		}
 	}
 }
+
+// If other runners are loaded, make sure the pending request will fit in system memory
+// If not, pick a runner to unload, else return nil and the request can be loaded
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+	slog.Debug("evaluating if CPU model load will fit in available system memory")
+	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
+	if estimate.TotalSize <= gpus[0].FreeMemory {
+		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
+		return nil
+	}
+
+	// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
+
+	return s.findRunnerToUnload()
+
+}
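Not part of the commit: a rough standalone sketch of the scheduling decision above, using simplified stand-in types. With num_gpu forced to 0 the scheduler consults system memory (getCpuFn) rather than the GPU list, and a CPU-only load skips unloading an existing runner only when the estimated total size fits in free system RAM (maybeFindCPURunnerToUnload).

package main

import "fmt"

// deviceInfo is a stand-in for gpu.GpuInfo: just a library name and free bytes.
type deviceInfo struct {
	library    string
	freeMemory uint64
}

// pickDevices mirrors the processPending change: num_gpu=0 consults the CPU /
// system memory path instead of the GPU list.
func pickDevices(numGPU int, getGPU, getCPU func() []deviceInfo) []deviceInfo {
	if numGPU == 0 {
		return getCPU()
	}
	return getGPU()
}

// needUnload reports whether an existing runner must be evicted before a
// CPU-only load: only when something is already loaded and the model's
// estimated total size does not fit in free system memory.
func needUnload(estimateTotal uint64, devs []deviceInfo, loadedCount int) bool {
	if loadedCount == 0 {
		return false
	}
	return estimateTotal > devs[0].freeMemory
}

func main() {
	getCPU := func() []deviceInfo { return []deviceInfo{{"cpu", 26 << 30}} }
	getGPU := func() []deviceInfo { return []deviceInfo{{"cuda", 12 << 30}} }

	devs := pickDevices(0, getGPU, getCPU)
	fmt.Println(needUnload(20<<30, devs, 1)) // false: 20 GiB fits in 26 GiB free
	fmt.Println(needUnload(30<<30, devs, 1)) // true: a runner must be unloaded first
}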
server/sched_test.go

@@ -60,7 +60,7 @@ func TestLoad(t *testing.T) {
 	err := <-req.errCh
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
-	server := &mockLlm{estimatedVRAM: 10}
+	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
 	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
 		return server, nil
 	}
@@ -146,7 +146,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		successCh: make(chan *runnerRef, 1),
 		errCh:     make(chan error, 1),
 	}
-	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM}
+	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return scenario
 }
 
@@ -182,6 +182,12 @@ func TestRequests(t *testing.T) {
 		g.FreeMemory = 12 * format.GigaByte
 		return []gpu.GpuInfo{g}
 	}
+	s.getCpuFn = func() gpu.GpuInfoList {
+		g := gpu.GpuInfo{Library: "cpu"}
+		g.TotalMemory = 32 * format.GigaByte
+		g.FreeMemory = 26 * format.GigaByte
+		return []gpu.GpuInfo{g}
+	}
 	s.newServerFn = scenario1a.newServer
 	slog.Info("scenario1a")
 	s.pendingReqCh <- scenario1a.req
@@ -420,7 +426,7 @@ func TestUseLoadedRunner(t *testing.T) {
 		sessionDuration: 2,
 	}
 	finished := make(chan *LlmRequest)
-	llm1 := &mockLlm{}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	r1 := &runnerRef{llama: llm1, sessionDuration: 1}
 	req.useLoadedRunner(r1, finished)
 	require.Equal(t, uint(1), r1.refCount)
@@ -453,8 +459,8 @@ func TestUpdateFreeSpace(t *testing.T) {
 	gpus[0].FreeMemory = 900
 	gpus[1].TotalMemory = 2000
 	gpus[1].FreeMemory = 1900
-	llm1 := &mockLlm{estimatedVRAM: 100}
-	llm2 := &mockLlm{estimatedVRAM: 200}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
+	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
 	r1 := &runnerRef{llama: llm1, gpus: gpus}
 	r2 := &runnerRef{llama: llm2, gpus: gpus}
 
@@ -465,8 +471,8 @@ func TestUpdateFreeSpace(t *testing.T) {
 	s.loadedMu.Unlock()
 
 	s.updateFreeSpace(gpus)
-	require.Equal(t, uint64(850), gpus[0].FreeMemory)
-	require.Equal(t, uint64(1850), gpus[1].FreeMemory)
+	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
+	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
 }
 
 func TestFindRunnerToUnload(t *testing.T) {
@@ -493,7 +499,7 @@ func TestNeedsReload(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
 
-	llm := &mockLlm{}
+	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	do := api.DefaultOptions()
 	runner := &runnerRef{
 		model: &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}},
@@ -536,8 +542,8 @@ func TestUnloadAllRunners(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
 
-	llm1 := &mockLlm{}
-	llm2 := &mockLlm{}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	s := InitScheduler(ctx)
 	s.unloadAllRunners()
 
@@ -555,7 +561,7 @@ func TestUnloadAllRunners(t *testing.T) {
 }
 
 func TestUnload(t *testing.T) {
-	llm1 := &mockLlm{}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	r1 := &runnerRef{llama: llm1}
 	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}}
 	r1.unload()
@@ -578,6 +584,7 @@ type mockLlm struct {
 	closeCalled        bool
 	estimatedVRAM      uint64
 	estimatedTotal     uint64
+	estimatedVRAMByGPU map[string]uint64
 }
 
 func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
@@ -600,3 +607,4 @@ func (s *mockLlm) Close() error {
 }
 func (s *mockLlm) EstimatedVRAM() uint64  { return s.estimatedVRAM }
 func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
+func (s *mockLlm) EstimagedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }