Centralize server config handling
This moves all of the environment variable reading into one central module and logs the loaded configuration once at startup, which should help when troubleshooting user server logs.
parent 6707768ebd
commit f56aa20014
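The change swaps per-call-site os.Getenv parsing for typed, package-level variables in the new server/envconfig package. As a rough illustration of the consumption pattern (a minimal sketch, not code from this commit; the import path and identifiers are taken from the diff below, and it assumes the ollama module is available as a dependency), a caller now reads pre-parsed values and can dump the whole configuration in one structured log line:

package main

import (
	"log/slog"

	"github.com/ollama/ollama/server/envconfig"
)

func main() {
	// One structured log line listing every OLLAMA_* setting, emitted once at startup.
	slog.Info("server config", "env", envconfig.AsMap())

	// Typed access: the value was parsed in envconfig's init(), no os.Getenv at the call site.
	if envconfig.Debug {
		slog.Info("debug logging enabled via OLLAMA_DEBUG")
	}
}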
@@ -5,12 +5,14 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 func InitLogging() {
 	level := slog.LevelInfo
 
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		level = slog.LevelDebug
 	}
@@ -31,16 +31,13 @@ func DoUpgrade(cancel context.CancelFunc, done chan int) error {
 		"/LOG=" + filepath.Base(UpgradeLogFile), // Only relative seems reliable, so set pwd
 		"/FORCECLOSEAPPLICATIONS", // Force close the tray app - might be needed
 	}
-	// When we're not in debug mode, make the upgrade as quiet as possible (no GUI, no prompts)
-	// TODO - temporarily disable since we're pinning in debug mode for the preview
-	// if debug := os.Getenv("OLLAMA_DEBUG"); debug == "" {
+	// make the upgrade as quiet as possible (no GUI, no prompts)
 	installArgs = append(installArgs,
 		"/SP", // Skip the "This will install... Do you wish to continue" prompt
 		"/SUPPRESSMSGBOXES",
 		"/SILENT",
 		"/VERYSILENT",
 	)
-	// }
 
 	// Safeguard in case we have requests in flight that need to drain...
 	slog.Info("Waiting for server to shutdown")
@@ -12,6 +12,8 @@ import (
 	"sync"
 	"syscall"
 	"time"
+
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 var (
@@ -24,45 +26,8 @@ func PayloadsDir() (string, error) {
 	defer lock.Unlock()
 	var err error
 	if payloadsDir == "" {
-		runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
-		// On Windows we do not carry the payloads inside the main executable
-		if runtime.GOOS == "windows" && runnersDir == "" {
-			appExe, err := os.Executable()
-			if err != nil {
-				slog.Error("failed to lookup executable path", "error", err)
-				return "", err
-			}
-
-			cwd, err := os.Getwd()
-			if err != nil {
-				slog.Error("failed to lookup working directory", "error", err)
-				return "", err
-			}
-
-			var paths []string
-			for _, root := range []string{filepath.Dir(appExe), cwd} {
-				paths = append(paths,
-					filepath.Join(root),
-					filepath.Join(root, "windows-"+runtime.GOARCH),
-					filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
-				)
-			}
-
-			// Try a few variations to improve developer experience when building from source in the local tree
-			for _, p := range paths {
-				candidate := filepath.Join(p, "ollama_runners")
-				_, err := os.Stat(candidate)
-				if err == nil {
-					runnersDir = candidate
-					break
-				}
-			}
-			if runnersDir == "" {
-				err = fmt.Errorf("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
-				slog.Error("incomplete distribution", "error", err)
-				return "", err
-			}
-		}
+		runnersDir := envconfig.RunnersDir
 		if runnersDir != "" {
 			payloadsDir = runnersDir
 			return payloadsDir, nil
@@ -70,7 +35,7 @@ func PayloadsDir() (string, error) {
 
 	// The remainder only applies on non-windows where we still carry payloads in the main executable
 	cleanupTmpDirs()
-	tmpDir := os.Getenv("OLLAMA_TMPDIR")
+	tmpDir := envconfig.TmpDir
 	if tmpDir == "" {
 		tmpDir, err = os.MkdirTemp("", "ollama")
 		if err != nil {
@@ -133,7 +98,7 @@ func cleanupTmpDirs() {
 func Cleanup() {
 	lock.Lock()
 	defer lock.Unlock()
-	runnersDir := os.Getenv("OLLAMA_RUNNERS_DIR")
+	runnersDir := envconfig.RunnersDir
 	if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
 		// We want to fully clean up the tmpdir parent of the payloads dir
 		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
@@ -21,6 +21,7 @@ import (
 	"unsafe"
 
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 type handles struct {
@@ -268,7 +269,7 @@ func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
 }
 
 func getVerboseState() C.uint16_t {
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		return C.uint16_t(1)
 	}
 	return C.uint16_t(0)
@@ -3,12 +3,11 @@ package llm
 import (
 	"fmt"
 	"log/slog"
-	"os"
-	"strconv"
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
@@ -50,15 +49,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
 	}
-	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
-	if userLimit != "" {
-		avail, err := strconv.ParseUint(userLimit, 10, 64)
-		if err != nil {
-			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
-		} else {
-			slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
-			memoryAvailable = avail
-		}
+	if envconfig.MaxVRAM > 0 {
+		memoryAvailable = envconfig.MaxVRAM
 	}
 
 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
@@ -26,6 +26,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/server/envconfig"
 )
 
 type LlamaServer interface {
@@ -124,7 +125,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	} else {
 		servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
 	}
-	demandLib := strings.Trim(os.Getenv("OLLAMA_LLM_LIBRARY"), "\"' ")
+	demandLib := envconfig.LLMLibrary
 	if demandLib != "" {
 		serverPath := availableServers[demandLib]
 		if serverPath == "" {
@@ -145,7 +146,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
 		"--embedding",
 	}
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		params = append(params, "--log-format", "json")
 	} else {
 		params = append(params, "--log-disable")
@@ -155,7 +156,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
 	}
 
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		params = append(params, "--verbose")
 	}
@@ -194,15 +195,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	}
 
 	// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
-	numParallel := 1
-	if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
-		numParallel, err = strconv.Atoi(onp)
-		if err != nil || numParallel <= 0 {
-			err = fmt.Errorf("invalid OLLAMA_NUM_PARALLEL=%s must be greater than zero - %w", onp, err)
-			slog.Error("misconfiguration", "error", err)
-			return nil, err
-		}
-	}
+	numParallel := envconfig.NumParallel
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
 	for i := 0; i < len(servers); i++ {
server/envconfig/config.go (new file, 174 lines)
@@ -0,0 +1,174 @@
package envconfig

import (
	"fmt"
	"log/slog"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
)

var (
	// Set via OLLAMA_ORIGINS in the environment
	AllowOrigins []string
	// Set via OLLAMA_DEBUG in the environment
	Debug bool
	// Set via OLLAMA_LLM_LIBRARY in the environment
	LLMLibrary string
	// Set via OLLAMA_MAX_LOADED_MODELS in the environment
	MaxRunners int
	// Set via OLLAMA_MAX_QUEUE in the environment
	MaxQueuedRequests int
	// Set via OLLAMA_MAX_VRAM in the environment
	MaxVRAM uint64
	// Set via OLLAMA_NOPRUNE in the environment
	NoPrune bool
	// Set via OLLAMA_NUM_PARALLEL in the environment
	NumParallel int
	// Set via OLLAMA_RUNNERS_DIR in the environment
	RunnersDir string
	// Set via OLLAMA_TMPDIR in the environment
	TmpDir string
)

func AsMap() map[string]string {
	return map[string]string{
		"OLLAMA_ORIGINS":           fmt.Sprintf("%v", AllowOrigins),
		"OLLAMA_DEBUG":             fmt.Sprintf("%v", Debug),
		"OLLAMA_LLM_LIBRARY":       fmt.Sprintf("%v", LLMLibrary),
		"OLLAMA_MAX_LOADED_MODELS": fmt.Sprintf("%v", MaxRunners),
		"OLLAMA_MAX_QUEUE":         fmt.Sprintf("%v", MaxQueuedRequests),
		"OLLAMA_MAX_VRAM":          fmt.Sprintf("%v", MaxVRAM),
		"OLLAMA_NOPRUNE":           fmt.Sprintf("%v", NoPrune),
		"OLLAMA_NUM_PARALLEL":      fmt.Sprintf("%v", NumParallel),
		"OLLAMA_RUNNERS_DIR":       fmt.Sprintf("%v", RunnersDir),
		"OLLAMA_TMPDIR":            fmt.Sprintf("%v", TmpDir),
	}
}

var defaultAllowOrigins = []string{
	"localhost",
	"127.0.0.1",
	"0.0.0.0",
}

// Clean quotes and spaces from the value
func clean(key string) string {
	return strings.Trim(os.Getenv(key), "\"' ")
}

func init() {
	// default values
	NumParallel = 1
	MaxRunners = 1
	MaxQueuedRequests = 512

	LoadConfig()
}

func LoadConfig() {
	if debug := clean("OLLAMA_DEBUG"); debug != "" {
		d, err := strconv.ParseBool(debug)
		if err == nil {
			Debug = d
		} else {
			Debug = true
		}
	}

	RunnersDir = clean("OLLAMA_RUNNERS_DIR")
	if runtime.GOOS == "windows" && RunnersDir == "" {
		// On Windows we do not carry the payloads inside the main executable
		appExe, err := os.Executable()
		if err != nil {
			slog.Error("failed to lookup executable path", "error", err)
		}

		cwd, err := os.Getwd()
		if err != nil {
			slog.Error("failed to lookup working directory", "error", err)
		}

		var paths []string
		for _, root := range []string{filepath.Dir(appExe), cwd} {
			paths = append(paths,
				filepath.Join(root),
				filepath.Join(root, "windows-"+runtime.GOARCH),
				filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
			)
		}

		// Try a few variations to improve developer experience when building from source in the local tree
		for _, p := range paths {
			candidate := filepath.Join(p, "ollama_runners")
			_, err := os.Stat(candidate)
			if err == nil {
				RunnersDir = candidate
				break
			}
		}
		if RunnersDir == "" {
			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
		}
	}

	TmpDir = clean("OLLAMA_TMPDIR")

	userLimit := clean("OLLAMA_MAX_VRAM")
	if userLimit != "" {
		avail, err := strconv.ParseUint(userLimit, 10, 64)
		if err != nil {
			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
		} else {
			MaxVRAM = avail
		}
	}

	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")

	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
		val, err := strconv.Atoi(onp)
		if err != nil || val <= 0 {
			slog.Error("invalid setting must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
		} else {
			NumParallel = val
		}
	}

	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
		NoPrune = true
	}

	if origins := clean("OLLAMA_ORIGINS"); origins != "" {
		AllowOrigins = strings.Split(origins, ",")
	}
	for _, allowOrigin := range defaultAllowOrigins {
		AllowOrigins = append(AllowOrigins,
			fmt.Sprintf("http://%s", allowOrigin),
			fmt.Sprintf("https://%s", allowOrigin),
			fmt.Sprintf("http://%s:*", allowOrigin),
			fmt.Sprintf("https://%s:*", allowOrigin),
		)
	}

	maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
	if maxRunners != "" {
		m, err := strconv.Atoi(maxRunners)
		if err != nil {
			slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
		} else {
			MaxRunners = m
		}
	}

	if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
		p, err := strconv.Atoi(onp)
		if err != nil || p <= 0 {
			slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
		} else {
			MaxQueuedRequests = p
		}
	}
}
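Because the package does its parsing in init() and also exports LoadConfig, any importer gets populated values for free, and a process that mutates its own environment can re-read them without restarting. A minimal sketch of that reload behavior (illustrative only, not part of the commit; the 2 GiB value is arbitrary):

package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/server/envconfig"
)

func main() {
	// envconfig's init() has already run LoadConfig once at import time.
	// Changing the environment and calling LoadConfig again picks up the new value,
	// since the package stores plain exported variables rather than a cached struct.
	os.Setenv("OLLAMA_MAX_VRAM", "2147483648") // 2 GiB, parsed with strconv.ParseUint
	envconfig.LoadConfig()
	fmt.Println(envconfig.AsMap()["OLLAMA_MAX_VRAM"]) // prints 2147483648
}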
server/envconfig/config_test.go (new file, 20 lines)
@@ -0,0 +1,20 @@
package envconfig

import (
	"os"
	"testing"

	"github.com/stretchr/testify/require"
)

func TestConfig(t *testing.T) {
	os.Setenv("OLLAMA_DEBUG", "")
	LoadConfig()
	require.False(t, Debug)
	os.Setenv("OLLAMA_DEBUG", "false")
	LoadConfig()
	require.False(t, Debug)
	os.Setenv("OLLAMA_DEBUG", "1")
	LoadConfig()
	require.True(t, Debug)
}
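The test above only covers OLLAMA_DEBUG. A hypothetical extension in the same style (not part of this commit) could exercise the integer and string settings as well; it assumes OLLAMA_NUM_PARALLEL and OLLAMA_LLM_LIBRARY are not already set when the test binary starts, so the defaults from init() are in effect:

package envconfig

import (
	"os"
	"testing"

	"github.com/stretchr/testify/require"
)

func TestMoreConfig(t *testing.T) {
	os.Setenv("OLLAMA_NUM_PARALLEL", "blue") // not an integer: rejected, current value (default 1) kept
	LoadConfig()
	require.Equal(t, 1, NumParallel)

	os.Setenv("OLLAMA_NUM_PARALLEL", "10")
	LoadConfig()
	require.Equal(t, 10, NumParallel)

	// clean() trims surrounding quotes and spaces, so quoted values from shells or
	// service files still come through as the bare string.
	os.Setenv("OLLAMA_LLM_LIBRARY", `"cpu_avx2"`)
	LoadConfig()
	require.Equal(t, "cpu_avx2", LLMLibrary)
}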
@@ -29,6 +29,7 @@ import (
 	"github.com/ollama/ollama/convert"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/server/envconfig"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
@@ -695,7 +696,7 @@ func CreateModel(ctx context.Context, name, modelFileDir, quantization string, m
 		return err
 	}
 
-	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+	if !envconfig.NoPrune {
 		if err := deleteUnusedLayers(nil, deleteMap, false); err != nil {
 			return err
 		}
@@ -1026,7 +1027,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
 
-	if noprune = os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+	if !envconfig.NoPrune {
 		manifest, _, err = GetManifest(mp)
 		if err != nil && !errors.Is(err, os.ErrNotExist) {
 			return err
@@ -29,6 +29,7 @@ import (
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
+	"github.com/ollama/ollama/server/envconfig"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
 )
@@ -859,12 +860,6 @@ func (s *Server) CreateBlobHandler(c *gin.Context) {
 	c.Status(http.StatusCreated)
 }
 
-var defaultAllowOrigins = []string{
-	"localhost",
-	"127.0.0.1",
-	"0.0.0.0",
-}
-
 func isLocalIP(ip netip.Addr) bool {
 	if interfaces, err := net.Interfaces(); err == nil {
 		for _, iface := range interfaces {
@@ -948,19 +943,7 @@ func (s *Server) GenerateRoutes() http.Handler {
 	config := cors.DefaultConfig()
 	config.AllowWildcard = true
 	config.AllowBrowserExtensions = true
-	if allowedOrigins := strings.Trim(os.Getenv("OLLAMA_ORIGINS"), "\"'"); allowedOrigins != "" {
-		config.AllowOrigins = strings.Split(allowedOrigins, ",")
-	}
-
-	for _, allowOrigin := range defaultAllowOrigins {
-		config.AllowOrigins = append(config.AllowOrigins,
-			fmt.Sprintf("http://%s", allowOrigin),
-			fmt.Sprintf("https://%s", allowOrigin),
-			fmt.Sprintf("http://%s:*", allowOrigin),
-			fmt.Sprintf("https://%s:*", allowOrigin),
-		)
-	}
+	config.AllowOrigins = envconfig.AllowOrigins
 
 	r := gin.Default()
 	r.Use(
@@ -999,10 +982,11 @@ func (s *Server) GenerateRoutes() http.Handler {
 
 func Serve(ln net.Listener) error {
 	level := slog.LevelInfo
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		level = slog.LevelDebug
 	}
 
+	slog.Info("server config", "env", envconfig.AsMap())
 	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
 		Level:     level,
 		AddSource: true,
@@ -1026,7 +1010,7 @@ func Serve(ln net.Listener) error {
 		return err
 	}
 
-	if noprune := os.Getenv("OLLAMA_NOPRUNE"); noprune == "" {
+	if !envconfig.NoPrune {
 		// clean up unused layers and manifests
 		if err := PruneLayers(); err != nil {
 			return err
@@ -5,10 +5,8 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
-	"os"
 	"reflect"
 	"sort"
-	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -17,6 +15,7 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/server/envconfig"
 	"golang.org/x/exp/slices"
 )
@@ -43,46 +42,14 @@ type Scheduler struct {
 	getGpuFn func() gpu.GpuInfoList
 }
 
-var (
-	// TODO set this to zero after a release or two, to enable multiple models by default
-	loadedMax         = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
-	maxQueuedRequests = 512
-	numParallel       = 1
-	ErrMaxQueue       = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
-)
+var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
 
 func InitScheduler(ctx context.Context) *Scheduler {
-	maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
-	if maxRunners != "" {
-		m, err := strconv.Atoi(maxRunners)
-		if err != nil {
-			slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
-		} else {
-			loadedMax = m
-		}
-	}
-	if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
-		p, err := strconv.Atoi(onp)
-		if err != nil || p <= 0 {
-			slog.Error("invalid parallel setting, must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
-		} else {
-			numParallel = p
-		}
-	}
-	if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
-		p, err := strconv.Atoi(onp)
-		if err != nil || p <= 0 {
-			slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
-		} else {
-			maxQueuedRequests = p
-		}
-	}
-
 	sched := &Scheduler{
-		pendingReqCh:  make(chan *LlmRequest, maxQueuedRequests),
-		finishedReqCh: make(chan *LlmRequest, maxQueuedRequests),
-		expiredCh:     make(chan *runnerRef, maxQueuedRequests),
-		unloadedCh:    make(chan interface{}, maxQueuedRequests),
+		pendingReqCh:  make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+		finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+		expiredCh:     make(chan *runnerRef, envconfig.MaxQueuedRequests),
+		unloadedCh:    make(chan interface{}, envconfig.MaxQueuedRequests),
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
 		getGpuFn:      gpu.GetGPUInfo,
@@ -94,7 +61,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
 	// allocate a large enough kv cache for all parallel requests
-	opts.NumCtx = opts.NumCtx * numParallel
+	opts.NumCtx = opts.NumCtx * envconfig.NumParallel
 
 	req := &LlmRequest{
 		ctx: c,
@@ -147,11 +114,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				pending.useLoadedRunner(runner, s.finishedReqCh)
 				break
 			}
-		} else if loadedMax > 0 && loadedCount >= loadedMax {
+		} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
 			slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 			runnerToExpire = s.findRunnerToUnload(pending)
 		} else {
-			// Either no models are loaded or below loadedMax
+			// Either no models are loaded or below envconfig.MaxRunners
 			// Get a refreshed GPU list
 			gpus := s.getGpuFn()
@@ -162,7 +129,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				break
 			}
 
-			// If we're CPU only mode, just limit by loadedMax above
+			// If we're CPU only mode, just limit by envconfig.MaxRunners above
 			// TODO handle system memory exhaustion
 			if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
 				slog.Debug("cpu mode with existing models, loading")
@@ -15,6 +15,7 @@ import (
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/server/envconfig"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@@ -27,34 +28,10 @@ func init() {
 func TestInitScheduler(t *testing.T) {
 	ctx, done := context.WithCancel(context.Background())
 	defer done()
-	initialMax := loadedMax
-	initialParallel := numParallel
 	s := InitScheduler(ctx)
-	require.Equal(t, initialMax, loadedMax)
 	s.loadedMu.Lock()
 	require.NotNil(t, s.loaded)
 	s.loadedMu.Unlock()
-
-	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "blue")
-	s = InitScheduler(ctx)
-	require.Equal(t, initialMax, loadedMax)
-	s.loadedMu.Lock()
-	require.NotNil(t, s.loaded)
-	s.loadedMu.Unlock()
-
-	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
-	s = InitScheduler(ctx)
-	require.Equal(t, 0, loadedMax)
-	s.loadedMu.Lock()
-	require.NotNil(t, s.loaded)
-	s.loadedMu.Unlock()
-
-	os.Setenv("OLLAMA_NUM_PARALLEL", "blue")
-	_ = InitScheduler(ctx)
-	require.Equal(t, initialParallel, numParallel)
-	os.Setenv("OLLAMA_NUM_PARALLEL", "10")
-	_ = InitScheduler(ctx)
-	require.Equal(t, 10, numParallel)
 }
 
 func TestLoad(t *testing.T) {
@@ -249,7 +226,7 @@ func TestRequests(t *testing.T) {
 		t.Errorf("timeout")
 	}
 
-	loadedMax = 1
+	envconfig.MaxRunners = 1
 	s.newServerFn = scenario3a.newServer
 	slog.Info("scenario3a")
 	s.pendingReqCh <- scenario3a.req
@@ -268,7 +245,7 @@ func TestRequests(t *testing.T) {
 	require.Len(t, s.loaded, 1)
 	s.loadedMu.Unlock()
 
-	loadedMax = 0
+	envconfig.MaxRunners = 0
 	s.newServerFn = scenario3b.newServer
 	slog.Info("scenario3b")
 	s.pendingReqCh <- scenario3b.req
@@ -339,7 +316,7 @@ func TestGetRunner(t *testing.T) {
 	scenario1b.req.sessionDuration = 0
 	scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
 	scenario1c.req.sessionDuration = 0
-	maxQueuedRequests = 1
+	envconfig.MaxQueuedRequests = 1
 	s := InitScheduler(ctx)
 	s.getGpuFn = func() gpu.GpuInfoList {
 		g := gpu.GpuInfo{Library: "metal"}