Restructure loading conditional chain
This commit is contained in:
parent
ceb0e26e5e
commit
36a6daccab
|
@ -123,36 +123,35 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||||
pending.useLoadedRunner(runner, s.finishedReqCh)
|
pending.useLoadedRunner(runner, s.finishedReqCh)
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
} else if loadedCount == 0 {
|
|
||||||
slog.Debug("loading first model", "model", pending.model.ModelPath)
|
|
||||||
gpus := s.getGpuFn()
|
|
||||||
|
|
||||||
ggml, err := llm.LoadModel(pending.model.ModelPath)
|
|
||||||
if err != nil {
|
|
||||||
pending.errCh <- err
|
|
||||||
break
|
|
||||||
}
|
|
||||||
g := pickBestFitGPUs(pending, ggml, gpus)
|
|
||||||
if g != nil {
|
|
||||||
gpus = g
|
|
||||||
}
|
|
||||||
s.loadFn(pending, ggml, gpus)
|
|
||||||
break
|
|
||||||
} else if loadedMax > 0 && loadedCount >= loadedMax {
|
} else if loadedMax > 0 && loadedCount >= loadedMax {
|
||||||
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
|
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
|
||||||
runnerToExpire = s.findRunnerToUnload(pending)
|
runnerToExpire = s.findRunnerToUnload(pending)
|
||||||
} else {
|
} else {
|
||||||
// More than one loaded model, so we have to see if the new one fits
|
// Either no models are loaded or below loadedMax
|
||||||
// Get a refreshed GPU list
|
// Get a refreshed GPU list
|
||||||
gpus := s.getGpuFn()
|
gpus := s.getGpuFn()
|
||||||
// Update free memory from currently loaded models
|
|
||||||
s.updateFreeSpace(gpus)
|
|
||||||
|
|
||||||
|
// Load model for fitting
|
||||||
ggml, err := llm.LoadModel(pending.model.ModelPath)
|
ggml, err := llm.LoadModel(pending.model.ModelPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
pending.errCh <- err
|
pending.errCh <- err
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// No models loaded. Load the model but prefer the best fit.
|
||||||
|
if loadedCount == 0 {
|
||||||
|
slog.Debug("loading first model", "model", pending.model.ModelPath)
|
||||||
|
g := pickBestFitGPUs(pending, ggml, gpus)
|
||||||
|
if g != nil {
|
||||||
|
gpus = g
|
||||||
|
}
|
||||||
|
s.loadFn(pending, ggml, gpus)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
// At least one model is already loaded, so we have to see if the new one fits
|
||||||
|
// Update free memory from currently loaded models
|
||||||
|
s.updateFreeSpace(gpus)
|
||||||
gpus = pickBestFitGPUs(pending, ggml, gpus)
|
gpus = pickBestFitGPUs(pending, ggml, gpus)
|
||||||
if gpus != nil {
|
if gpus != nil {
|
||||||
slog.Debug("new model fits with existing models, loading")
|
slog.Debug("new model fits with existing models, loading")
|
||||||
|
|
|
@ -47,7 +47,7 @@ func TestLoad(t *testing.T) {
|
||||||
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
|
ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
|
||||||
defer done()
|
defer done()
|
||||||
s := InitScheduler(ctx)
|
s := InitScheduler(ctx)
|
||||||
ggml := nil // value not used in tests
|
var ggml *llm.GGML // value not used in tests
|
||||||
req := &LlmRequest{
|
req := &LlmRequest{
|
||||||
ctx: ctx,
|
ctx: ctx,
|
||||||
model: &Model{ModelPath: "foo"},
|
model: &Model{ModelPath: "foo"},
|
||||||
|
|
Loading…
Reference in a new issue