From dfa2f32ca07dbb6586bf06dd12c82a04c1fee79d Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 5 May 2024 17:18:27 -0700 Subject: [PATCH] unload in critical section (#4187) --- server/sched.go | 8 ++++---- server/sched_test.go | 9 +++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/server/sched.go b/server/sched.go index 9d97c632..164814a3 100644 --- a/server/sched.go +++ b/server/sched.go @@ -116,7 +116,7 @@ func (s *Scheduler) processPending(ctx context.Context) { } } else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners { slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount) - runnerToExpire = s.findRunnerToUnload(pending) + runnerToExpire = s.findRunnerToUnload() } else { // Either no models are loaded or below envconfig.MaxRunners // Get a refreshed GPU list @@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) { s.loadFn(pending, ggml, gpus) break } - runnerToExpire = s.findRunnerToUnload(pending) + runnerToExpire = s.findRunnerToUnload() } if runnerToExpire == nil { @@ -257,9 +257,9 @@ func (s *Scheduler) processCompleted(ctx context.Context) { continue } + s.loadedMu.Lock() slog.Debug("got lock to unload", "model", runner.model) runner.unload() - s.loadedMu.Lock() delete(s.loaded, runner.model) s.loadedMu.Unlock() slog.Debug("runner released", "model", runner.model) @@ -504,7 +504,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu. } // findRunnerToUnload finds a runner to unload to make room for a new model -func (s *Scheduler) findRunnerToUnload(req *LlmRequest) *runnerRef { +func (s *Scheduler) findRunnerToUnload() *runnerRef { s.loadedMu.Lock() runnerList := make([]*runnerRef, 0, len(s.loaded)) for _, r := range s.loaded { diff --git a/server/sched_test.go b/server/sched_test.go index 0e70b843..3e47ed02 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -473,10 +473,7 @@ func TestUpdateFreeSpace(t *testing.T) { func TestFindRunnerToUnload(t *testing.T) { ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) defer done() - req := &LlmRequest{ - ctx: ctx, - opts: api.DefaultOptions(), - } + r1 := &runnerRef{refCount: 1, sessionDuration: 1} r2 := &runnerRef{sessionDuration: 2} @@ -486,10 +483,10 @@ func TestFindRunnerToUnload(t *testing.T) { s.loaded["b"] = r2 s.loadedMu.Unlock() - resp := s.findRunnerToUnload(req) + resp := s.findRunnerToUnload() require.Equal(t, r2, resp) r2.refCount = 1 - resp = s.findRunnerToUnload(req) + resp = s.findRunnerToUnload() require.Equal(t, r1, resp) }