Centralize server config handling
This moves all of the environment variable reading into one central module and logs the loaded configuration once at startup, which should help when troubleshooting user server logs.
parent 6707768ebd
commit f56aa20014
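The change swaps per-call-site os.Getenv parsing for typed, package-level variables in the new server/envconfig package. As a rough illustration of the consumption pattern (a minimal sketch, not code from this commit; the import path and identifiers are taken from the diff below, and it assumes the ollama module is available as a dependency), a caller now reads pre-parsed values and can dump the whole configuration in one structured log line:

package main

import (
	"log/slog"

	"github.com/ollama/ollama/server/envconfig"
)

func main() {
	// One structured log line listing every OLLAMA_* setting, emitted once at startup.
	slog.Info("server config", "env", envconfig.AsMap())

	// Typed access: the value was parsed in envconfig's init(), no os.Getenv at the call site.
	if envconfig.Debug {
		slog.Info("debug logging enabled via OLLAMA_DEBUG")
	}
}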
@@ -5,12 +5,14 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 func InitLogging() {
 	level := slog.LevelInfo
 
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		level = slog.LevelDebug
 	}
@@ -31,16 +31,13 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
 		"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
 		"/FORCECLOSEAPPLICATIONS", // Force close the tray app - might be needed
 	}
-	// When we're not in debug mode, make the upgrade as quiet as possible (no GUI, no prompts)
-	// TODO - temporarily disable since we're pinning in debug mode for the preview
-	// if debug := os.Getenv("OLLAMA_DEBUG"); debug == "" {
+	// make the upgrade as quiet as possible (no GUI, no prompts)
 	installArgs = append(installArgs,
 		"/SP", // Skip the "This will install... Do you wish to continue" prompt
 		"/SUPPRESSMSGBOXES",
 		"/SILENT",
 		"/VERYSILENT",
 	)
-	// }
 
 	// Safeguard in case we have requests in flight that need to drain...
 	slog.Info("Waiting for server to shutdown")
@@ -12,6 +12,8 @@ import (
 	"sync"
 	"syscall"
 	"time"
+
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 var (
@@ -24,45 +26,8 @@ func PayloadsDir() (string, error) {
 	defer lock.Unlock()
 	var err error
 	if payloadsDir == "" {
-		runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
-		// On Windows we do not carry the payloads inside the main executable
-		if runtime.GOOS == "windows" && runnersDir == "" {
-			appExe, err := os.Executable()
-			if err != nil {
-				slog.Error("failed to lookup executable path", "error", err)
-				return "", err
-			}
-
-			cwd, err := os.Getwd()
-			if err != nil {
-				slog.Error("failed to lookup working directory", "error", err)
-				return "", err
-			}
-
-			var paths []string
-			for _, root := range []string{filepath.Dir(appExe), cwd} {
-				paths = append(paths,
-					filepath.Join(root),
-					filepath.Join(root, "windows-"+runtime.GOARCH),
-					filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
-				)
-			}
-
-			// Try a few variations to improve developer experience when building from source in the local tree
-			for _, p := range paths {
-				candidate := filepath.Join(p, "ollama_runners")
-				_, err := os.Stat(candidate)
-				if err == nil {
-					runnersDir = candidate
-					break
-				}
-			}
-			if runnersDir == "" {
-				err = fmt.Errorf("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
-				slog.Error("incomplete distribution", "error", err)
-				return "", err
-			}
-		}
+		runnersDir := envconfig.RunnersDir
 		if runnersDir != "" {
 			payloadsDir = runnersDir
 			return payloadsDir, nil
@@ -70,7 +35,7 @@ func PayloadsDir() (string, error) {
 
 	// The remainder only applies on non-windows where we still carry payloads in the main executable
 	cleanupTmpDirs()
-	tmpDir := os.Getenv("OLLAMA_TMPDIR")
+	tmpDir := envconfig.TmpDir
 	if tmpDir == "" {
 		tmpDir, err = os.MkdirTemp("", "ollama")
 		if err != nil {
@@ -133,7 +98,7 @@ func cleanupTmpDirs() {
 func Cleanup() {
 	lock.Lock()
 	defer lock.Unlock()
-	runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
+	runnersDir := envconfig.RunnersDir
 	if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
 		// We want to fully clean up the tmpdir parent of the payloads dir
 		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
@@ -21,6 +21,7 @@ import (
 	"unsafe"
 
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 type handles struct {
@@ -268,7 +269,7 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
 }
 
 func getVerboseState() C.uint16_t {
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		return C.uint16_t(1)
 	}
 	return C.uint16_t(0)
@@ -3,12 +3,11 @@ package llm
 import (
 	"fmt"
 	"log/slog"
-	"os"
-	"strconv"
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
@@ -50,15 +49,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
 	}
-	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
-	if userLimit != "" {
-		avail, err := strconv.ParseUint(userLimit, 10, 64)
-		if err != nil {
-			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
-		} else {
-			slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
-			memoryAvailable = avail
-		}
+	if envconfig.MaxVRAM > 0 {
+		memoryAvailable = envconfig.MaxVRAM
 	}
 
 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
@@ -26,6 +26,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 type LlamaServer interface {
@@ -124,7 +125,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	} else {
 		servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
 	}
-	demandLib := strings.Trim(os.Getenv("OLLAMA_LLM_LIBRARY"), "\"' ")
+	demandLib := envconfig.LLMLibrary
 	if demandLib != "" {
 		serverPath := availableServers[demandLib]
 		if serverPath == "" {
@@ -145,7 +146,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
 		"--embedding",
 	}
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		params = append(params, "--log-format", "json")
 	} else {
 		params = append(params, "--log-disable")
@@ -155,7 +156,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
 	}
 
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		params = append(params, "--verbose")
 	}
@@ -194,15 +195,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	}
 
 	// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
-	numParallel := 1
-	if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
-		numParallel, err = strconv.Atoi(onp)
-		if err != nil || numParallel <= 0 {
-			err = fmt.Errorf("invalid OLLAMA_NUM_PARALLEL=%s must be greater than zero - %w", onp, err)
-			slog.Error("misconfiguration", "error", err)
-			return nil, err
-		}
-	}
+	numParallel := envconfig.NumParallel
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
 	for i := 0; i < len(servers); i++ {
server/envconfig/config.go (new file, 174 lines)
@@ -0,0 +1,174 @@
package envconfig

import (
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
)

var (
	// Set via OLLAMA_ORIGINS in the environment
	AllowOrigins []string
	// Set via OLLAMA_DEBUG in the environment
	Debug bool
	// Set via OLLAMA_LLM_LIBRARY in the environment
	LLMLibrary string
	// Set via OLLAMA_MAX_LOADED_MODELS in the environment
	MaxRunners int
	// Set via OLLAMA_MAX_QUEUE in the environment
	MaxQueuedRequests int
	// Set via OLLAMA_MAX_VRAM in the environment
	MaxVRAM uint64
	// Set via OLLAMA_NOPRUNE in the environment
	NoPrune bool
	// Set via OLLAMA_NUM_PARALLEL in the environment
	NumParallel int
	// Set via OLLAMA_RUNNERS_DIR in the environment
	RunnersDir string
	// Set via OLLAMA_TMPDIR in the environment
	TmpDir string
)

func AsMap() map[string]string {
	return map[string]string{
		"OLLAMA_ORIGINS":           fmt.Sprintf("%v", AllowOrigins),
		"OLLAMA_DEBUG":             fmt.Sprintf("%v", Debug),
		"OLLAMA_LLM_LIBRARY":       fmt.Sprintf("%v", LLMLibrary),
		"OLLAMA_MAX_LOADED_MODELS": fmt.Sprintf("%v", MaxRunners),
		"OLLAMA_MAX_QUEUE":         fmt.Sprintf("%v", MaxQueuedRequests),
		"OLLAMA_MAX_VRAM":          fmt.Sprintf("%v", MaxVRAM),
		"OLLAMA_NOPRUNE":           fmt.Sprintf("%v", NoPrune),
		"OLLAMA_NUM_PARALLEL":      fmt.Sprintf("%v", NumParallel),
		"OLLAMA_RUNNERS_DIR":       fmt.Sprintf("%v", RunnersDir),
		"OLLAMA_TMPDIR":            fmt.Sprintf("%v", TmpDir),
	}
}

var defaultAllowOrigins = []string{
	"localhost",
	"127.0.0.1",
	"0.0.0.0",
}

// Clean quotes and spaces from the value
func clean(key string) string {
	return strings.Trim(os.Getenv(key), "\"' ")
}

func init() {
	// default values
	NumParallel = 1
	MaxRunners = 1
	MaxQueuedRequests = 512

	LoadConfig()
}

func LoadConfig() {
	if debug := clean("OLLAMA_DEBUG"); debug != "" {
		d, err := strconv.ParseBool(debug)
		if err == nil {
			Debug = d
		} else {
			Debug = true
		}
	}

	RunnersDir = clean("OLLAMA_RUNNERS_DIR")
	if runtime.GOOS == "windows" && RunnersDir == "" {
		// On Windows we do not carry the payloads inside the main executable
		appExe, err := os.Executable()
		if err != nil {
			slog.Error("failed to lookup executable path", "error", err)
		}

		cwd, err := os.Getwd()
		if err != nil {
			slog.Error("failed to lookup working directory", "error", err)
		}

		var paths []string
		for _, root := range []string{filepath.Dir(appExe), cwd} {
			paths = append(paths,
				filepath.Join(root),
				filepath.Join(root, "windows-"+runtime.GOARCH),
				filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
			)
		}

		// Try a few variations to improve developer experience when building from source in the local tree
		for _, p := range paths {
			candidate := filepath.Join(p, "ollama_runners")
			_, err := os.Stat(candidate)
			if err == nil {
				RunnersDir = candidate
				break
			}
		}
		if RunnersDir == "" {
			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
		}
	}

	TmpDir = clean("OLLAMA_TMPDIR")

	userLimit := clean("OLLAMA_MAX_VRAM")
	if userLimit != "" {
		avail, err := strconv.ParseUint(userLimit, 10, 64)
		if err != nil {
			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
		} else {
			MaxVRAM = avail
		}
	}

	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")

	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
		val, err := strconv.Atoi(onp)
		if err != nil || val <= 0 {
			slog.Error("invalid setting must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
		} else {
			NumParallel = val
		}
	}

	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
		NoPrune = true
	}

	if origins := clean("OLLAMA_ORIGINS"); origins != "" {
		AllowOrigins = strings.Split(origins, ",")
	}
	for _, allowOrigin := range defaultAllowOrigins {
		AllowOrigins = append(AllowOrigins,
			fmt.Sprintf("http://%s", allowOrigin),
			fmt.Sprintf("https://%s", allowOrigin),
			fmt.Sprintf("http://%s:*", allowOrigin),
			fmt.Sprintf("https://%s:*", allowOrigin),
		)
	}

	maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
	if maxRunners != "" {
		m, err := strconv.Atoi(maxRunners)
		if err != nil {
			slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
		} else {
			MaxRunners = m
		}
	}

	if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
		p, err := strconv.Atoi(onp)
		if err != nil || p <= 0 {
			slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
		} else {
			MaxQueuedRequests = p
		}
	}
}
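Because the package does its parsing in init() and also exports LoadConfig, any importer gets populated values for free, and a process that mutates its own environment can re-read them without restarting. A minimal sketch of that reload behavior (illustrative only, not part of the commit; the 2 GiB value is arbitrary):

package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/server/envconfig"
)

func main() {
	// envconfig's init() has already run LoadConfig once at import time.
	// Changing the environment and calling LoadConfig again picks up the new value,
	// since the package stores plain exported variables rather than a cached struct.
	os.Setenv("OLLAMA_MAX_VRAM", "2147483648") // 2 GiB, parsed with strconv.ParseUint
	envconfig.LoadConfig()
	fmt.Println(envconfig.AsMap()["OLLAMA_MAX_VRAM"]) // prints 2147483648
}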
server/envconfig/config_test.go (new file, 20 lines)
@@ -0,0 +1,20 @@
package envconfig

import (
	"os"
	"testing"

	"github.com/stretchr/testify/require"
)

func TestConfig(t *testing.T) {
	os.Setenv("OLLAMA_DEBUG", "")
	LoadConfig()
	require.False(t, Debug)
	os.Setenv("OLLAMA_DEBUG", "false")
	LoadConfig()
	require.False(t, Debug)
	os.Setenv("OLLAMA_DEBUG", "1")
	LoadConfig()
	require.True(t, Debug)
}
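The test above only covers OLLAMA_DEBUG. A hypothetical extension in the same style (not part of this commit) could exercise the integer and string settings as well; it assumes OLLAMA_NUM_PARALLEL and OLLAMA_LLM_LIBRARY are not already set when the test binary starts, so the defaults from init() are in effect:

package envconfig

import (
	"os"
	"testing"

	"github.com/stretchr/testify/require"
)

func TestMoreConfig(t *testing.T) {
	os.Setenv("OLLAMA_NUM_PARALLEL", "blue") // not an integer: rejected, current value (default 1) kept
	LoadConfig()
	require.Equal(t, 1, NumParallel)

	os.Setenv("OLLAMA_NUM_PARALLEL", "10")
	LoadConfig()
	require.Equal(t, 10, NumParallel)

	// clean() trims surrounding quotes and spaces, so quoted values from shells or
	// service files still come through as the bare string.
	os.Setenv("OLLAMA_LLM_LIBRARY", `"cpu_avx2"`)
	LoadConfig()
	require.Equal(t, "cpu_avx2", LLMLibrary)
}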
@@ -29,6 +29,7 @@ import (
 	"github.com/ollama/ollama/convert"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/server/envconfig"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
@@ -695,7 +696,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
 		return err
 	}
 
-	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+	if !envconfig.NoPrune {
 		if err := deleteUnusedLayers(nil, deleteMap, false); err != nil {
 			return err
 		}
@@ -1026,7 +1027,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
 
-	if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+	if !envconfig.NoPrune {
 		manifest, _, err = GetManifest(mp)
 		if err != nil && !errors.Is(err, os.ErrNotExist) {
 			return err
@@ -29,6 +29,7 @@ import (
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
+	"github.com/ollama/ollama/server/envconfig"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@@ -859,12 +860,6 @@ func (s *Server) CreateBlobHandler(c *gin.Context) {
 	c.Status(http.StatusCreated)
 }
 
-var defaultAllowOrigins = []string{
-	"localhost",
-	"127.0.0.1",
-	"0.0.0.0",
-}
-
 func isLocalIP(ip netip.Addr) bool {
 	if interfaces, err := net.Interfaces(); err == nil {
 		for _, iface := range interfaces {
@@ -948,19 +943,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	config := cors.DefaultConfig()
 	config.AllowWildcard = true
 	config.AllowBrowserExtensions = true
-	if allowedOrigins := strings.Trim(os.Getenv("OLLAMA_ORIGINS"), "\"'"); allowedOrigins != "" {
-		config.AllowOrigins = strings.Split(allowedOrigins, ",")
-	}
-
-	for _, allowOrigin := range defaultAllowOrigins {
-		config.AllowOrigins = append(config.AllowOrigins,
-			fmt.Sprintf("http://%s", allowOrigin),
-			fmt.Sprintf("https://%s", allowOrigin),
-			fmt.Sprintf("http://%s:*", allowOrigin),
-			fmt.Sprintf("https://%s:*", allowOrigin),
-		)
-	}
+	config.AllowOrigins = envconfig.AllowOrigins
 
 	r := gin.Default()
 	r.Use(
@@ -999,10 +982,11 @@ func (s *Server) GenerateRoutes() http.Handler {
 
 func Serve(ln net.Listener) error {
 	level := slog.LevelInfo
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		level = slog.LevelDebug
 	}
 
+	slog.Info("server config", "env", envconfig.AsMap())
 	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
 		Level:     level,
 		AddSource: true,
@@ -1026,7 +1010,7 @@ func Serve(ln net.Listener) error {
 		return err
 	}
 
-	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+	if !envconfig.NoPrune {
 		// clean up unused layers and manifests
 		if err := PruneLayers(); err != nil {
 			return err
@@ -5,10 +5,8 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
-	"os"
 	"reflect"
 	"sort"
-	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -17,6 +15,7 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/server/envconfig"
 	"golang.org/x/exp/slices"
 )
@@ -43,46 +42,14 @@ type Scheduler struct {
 	getGpuFn func() gpu.GpuInfoList
 }
 
-var (
-	// TODO set this to zero after a release or two, to enable multiple models by default
-	loadedMax         = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
-	maxQueuedRequests = 512
-	numParallel       = 1
-	ErrMaxQueue       = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
-)
+var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
 
 func InitScheduler(ctx context.Context) *Scheduler {
-	maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
-	if maxRunners != "" {
-		m, err := strconv.Atoi(maxRunners)
-		if err != nil {
-			slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
-		} else {
-			loadedMax = m
-		}
-	}
-	if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
-		p, err := strconv.Atoi(onp)
-		if err != nil || p <= 0 {
-			slog.Error("invalid parallel setting, must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
-		} else {
-			numParallel = p
-		}
-	}
-	if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
-		p, err := strconv.Atoi(onp)
-		if err != nil || p <= 0 {
-			slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
-		} else {
-			maxQueuedRequests = p
-		}
-	}
-
 	sched := &Scheduler{
-		pendingReqCh:  make(chan *LlmRequest, maxQueuedRequests),
-		finishedReqCh: make(chan *LlmRequest, maxQueuedRequests),
-		expiredCh:     make(chan *runnerRef, maxQueuedRequests),
-		unloadedCh:    make(chan interface{}, maxQueuedRequests),
+		pendingReqCh:  make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+		finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+		expiredCh:     make(chan *runnerRef, envconfig.MaxQueuedRequests),
+		unloadedCh:    make(chan interface{}, envconfig.MaxQueuedRequests),
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
 		getGpuFn:      gpu.GetGPUInfo,
@@ -94,7 +61,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
 	// allocate a large enough kv cache for all parallel requests
-	opts.NumCtx = opts.NumCtx * numParallel
+	opts.NumCtx = opts.NumCtx * envconfig.NumParallel
 
 	req := &LlmRequest{
 		ctx: c,
@@ -147,11 +114,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				pending.useLoadedRunner(runner, s.finishedReqCh)
 				break
 			}
-		} else if loadedMax > 0 && loadedCount >= loadedMax {
+		} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
 			slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 			runnerToExpire = s.findRunnerToUnload(pending)
 		} else {
-			// Either no models are loaded or below loadedMax
+			// Either no models are loaded or below envconfig.MaxRunners
 			// Get a refreshed GPU list
 			gpus := s.getGpuFn()
@@ -162,7 +129,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				break
 			}
 
-			// If we're CPU only mode, just limit by loadedMax above
+			// If we're CPU only mode, just limit by envconfig.MaxRunners above
 			// TODO handle system memory exhaustion
 			if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
 				slog.Debug("cpu mode with existing models, loading")
@@ -15,6 +15,7 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/server/envconfig"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@@ -27,34 +28,10 @@ func init() {
 func TestInitScheduler(t *testing.T) {
 	ctx, done := context.WithCancel(context.Background())
 	defer done()
-	initialMax := loadedMax
-	initialParallel := numParallel
 	s := InitScheduler(ctx)
-	require.Equal(t, initialMax, loadedMax)
 	s.loadedMu.Lock()
 	require.NotNil(t, s.loaded)
 	s.loadedMu.Unlock()
-
-	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "blue")
-	s = InitScheduler(ctx)
-	require.Equal(t, initialMax, loadedMax)
-	s.loadedMu.Lock()
-	require.NotNil(t, s.loaded)
-	s.loadedMu.Unlock()
-
-	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
-	s = InitScheduler(ctx)
-	require.Equal(t, 0, loadedMax)
-	s.loadedMu.Lock()
-	require.NotNil(t, s.loaded)
-	s.loadedMu.Unlock()
-
-	os.Setenv("OLLAMA_NUM_PARALLEL", "blue")
-	_ = InitScheduler(ctx)
-	require.Equal(t, initialParallel, numParallel)
-	os.Setenv("OLLAMA_NUM_PARALLEL", "10")
-	_ = InitScheduler(ctx)
-	require.Equal(t, 10, numParallel)
 }
 
 func TestLoad(t *testing.T) {
@@ -249,7 +226,7 @@ func TestRequests(t *testing.T) {
 		t.Errorf("timeout")
 	}
 
-	loadedMax = 1
+	envconfig.MaxRunners = 1
 	s.newServerFn = scenario3a.newServer
 	slog.Info("scenario3a")
 	s.pendingReqCh <- scenario3a.req
@@ -268,7 +245,7 @@ func TestRequests(t *testing.T) {
 	require.Len(t, s.loaded, 1)
 	s.loadedMu.Unlock()
 
-	loadedMax = 0
+	envconfig.MaxRunners = 0
 	s.newServerFn = scenario3b.newServer
 	slog.Info("scenario3b")
 	s.pendingReqCh <- scenario3b.req
@@ -339,7 +316,7 @@ func TestGetRunner(t *testing.T) {
 	scenario1b.req.sessionDuration = 0
 	scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
 	scenario1c.req.sessionDuration = 0
-	maxQueuedRequests = 1
+	envconfig.MaxQueuedRequests = 1
 	s := InitScheduler(ctx)
 	s.getGpuFn = func() gpu.GpuInfoList {
 		g := gpu.GpuInfo{Library: "metal"}