Merge pull request #4299 from dhiltgen/handle_vram_reporting_lag

Wait for GPU free memory reporting to converge
This commit is contained in:
Daniel Hiltgen 2024-05-09 15:08:56 -07:00 committed by GitHub
commit 83d6d46e29
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 61 additions and 3 deletions

View file

@ -8,14 +8,14 @@ import (
func GetCPUVariant() string {
if cpu.X86.HasAVX2 {
slog.Info("CPU has AVX2")
slog.Debug("CPU has AVX2")
return "avx2"
}
if cpu.X86.HasAVX {
slog.Info("CPU has AVX")
slog.Debug("CPU has AVX")
return "avx"
}
slog.Info("CPU does not have vector extensions")
slog.Debug("CPU does not have vector extensions")
// else LCD
return ""
}

View file

@ -265,11 +265,14 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
s.loadedMu.Lock()
slog.Debug("got lock to unload", "model", runner.model)
finished := runner.waitForVRAMRecovery()
runner.unload()
delete(s.loaded, runner.model)
s.loadedMu.Unlock()
slog.Debug("runner released", "model", runner.model)
runner.refMu.Unlock()
<-finished
slog.Debug("sending an unloaded event", "model", runner.model)
s.unloadedCh <- struct{}{}
}
@ -465,6 +468,61 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
return false
}
// Free memory reporting on GPUs can lag for a while even after the runner
// exits, so we have to keep checking until we see the available memory recover,
// otherwise subsequent model loads will get far less layers loaded or worse
// case, may completely fall back to CPU mode.
// This routine must be called before the runner unloads so it can establish
// a before and after GPU memory allocation. The returned channel
// will be notified when we're done waiting, or have timed out and should
// proceed anyway
func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
finished := make(chan interface{}, 1)
// CPU or Metal don't need checking, so no waiting required
if len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal") {
finished <- struct{}{}
return finished
}
start := time.Now()
// Establish a baseline before we unload
gpusBefore := gpu.GetGPUInfo()
var totalMemoryBefore, freeMemoryBefore uint64
for _, gpu := range gpusBefore {
totalMemoryBefore += gpu.TotalMemory
freeMemoryBefore += gpu.FreeMemory
}
go func() {
expiresAt := start.Add(5 * time.Second) // typical convergence is 0.5-1.5s
ticker := time.NewTicker(250 * time.Millisecond)
defer ticker.Stop()
for {
<-ticker.C
if time.Now().After(expiresAt) {
slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
finished <- struct{}{}
}
// Query GPUs, look for free to go back up
gpusNow := gpu.GetGPUInfo()
var totalMemoryNow, freeMemoryNow uint64
for _, gpu := range gpusNow {
totalMemoryNow += gpu.TotalMemory
freeMemoryNow += gpu.FreeMemory
}
// If we're within ~80% of the estimated memory usage recovered, bail out
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
finished <- struct{}{}
return
}
}
}()
return finished
}
type ByDuration []*runnerRef
func (a ByDuration) Len() int { return len(a) }