From 6fd04ca922e5da7ef8c52d86118fc58b798a7e4a Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Sat, 18 May 2024 12:34:31 -0700
Subject: [PATCH] Improve multi-gpu handling at the limit

Still not complete. Our prediction needs some refinement to understand
each discrete GPU's available space so we can see how many layers fit
on each one; since we can't split a single layer across multiple GPUs,
we can't treat free space as one logical block.
---
 gpu/amd_linux.go                |   2 +-
 gpu/gpu.go                      |   4 -
 gpu/types.go                    |   6 +-
 integration/concurrency_test.go |  20 ++-
 integration/context_test.go     |   2 +-
 integration/utils_test.go       |   2 +-
 llm/ggml.go                     |   1 +
 llm/memory.go                   | 267 ++++++++++++++++++++++++++------
 llm/memory_test.go              | 116 ++++++++++++++
 llm/server.go                   |  59 +++----
 server/sched_test.go            |   1 +
 11 files changed, 390 insertions(+), 90 deletions(-)
 create mode 100644 llm/memory_test.go

diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go
index 5c45165f..97c8274f 100644
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -27,7 +27,7 @@ const (
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 
 	// Direct Rendering Manager sysfs location
-	DRMDeviceDirGlob = "/sys/class/drm/card[0-9]/device"
+	DRMDeviceDirGlob = "/sys/class/drm/card*/device"
 	DRMTotalMemoryFile = "mem_info_vram_total"
 	DRMUsedMemoryFile = "mem_info_vram_used"
 
diff --git a/gpu/gpu.go b/gpu/gpu.go
index 5dcab592..1832667b 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -246,10 +246,6 @@ func GetGPUInfo() GpuInfoList {
 		return GpuInfoList{cpus[0].GpuInfo}
 	}
 
-	// TODO - implement
-
-	// TODO refine the discovery to only gather total memory
-
 	// On windows we bundle the nvidia library one level above the runner dir
 	depPath := ""
 	if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
diff --git a/gpu/types.go b/gpu/types.go
index c712af90..a633e6c7 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -44,14 +44,14 @@ type CPUInfo struct {
 
 type CudaGPUInfo struct {
 	GpuInfo
-	index int // device index
+	index int // nolint: unused
 }
 type CudaGPUInfoList []CudaGPUInfo
 
 type RocmGPUInfo struct {
 	GpuInfo
-	usedFilepath string // linux
-	index int // device index on windows
+	usedFilepath string // nolint: unused
+	index int // nolint: unused
 }
 type RocmGPUInfoList []RocmGPUInfo
 
diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go
index 1f341fc3..e4bc872c 100644
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -38,7 +38,7 @@ func TestMultiModelConcurrency(t *testing.T) {
 		}
 		resp = [2][]string{
 			[]string{"sunlight"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
+			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
 		}
 	)
 	var wg sync.WaitGroup
@@ -229,5 +229,23 @@ func TestMultiModelStress(t *testing.T) {
 			}
 		}(i)
 	}
+	go func() {
+		for {
+			time.Sleep(2 * time.Second)
+			select {
+			case <-ctx.Done():
+				return
+			default:
+				models, err := client.ListRunning(ctx)
+				if err != nil {
+					slog.Warn("failed to list running models", "error", err)
+					continue
+				}
+				for _, m := range models.Models {
+					slog.Info("loaded model snapshot", "model", m)
+				}
+			}
+		}
+	}()
 	wg.Wait()
 }
diff --git a/integration/context_test.go b/integration/context_test.go
index 08033125..75efb435 100644
--- a/integration/context_test.go
+++ b/integration/context_test.go
@@ -11,7 +11,7 @@ import (
 )
 
 func TestContextExhaustion(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
+ ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute) // Longer needed for small footprint GPUs defer cancel() // Set up the test data req := api.GenerateRequest{ diff --git a/integration/utils_test.go b/integration/utils_test.go index c6f19e98..5da6fc72 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -331,7 +331,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) { [][]string{ []string{"sunlight"}, []string{"soil", "organic", "earth", "black", "tan"}, - []string{"england", "english", "massachusetts", "pilgrims"}, + []string{"england", "english", "massachusetts", "pilgrims", "british"}, []string{"fourth", "july", "declaration", "independence"}, []string{"nitrogen", "oxygen", "carbon", "dioxide"}, } diff --git a/llm/ggml.go b/llm/ggml.go index 645447d5..35b89d16 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -307,6 +307,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui partialOffload = 4 * batch * embedding partialOffload += max( + // 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()), 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV), 4*batch*(embedding+vocab)+embedding*vocab*105/128, ) diff --git a/llm/memory.go b/llm/memory.go index 1c2e476b..6f830cb1 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -3,9 +3,10 @@ package llm import ( "fmt" "log/slog" + "strconv" + "strings" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/gpu" ) @@ -16,7 +17,8 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors var estimatedVRAM uint64 for _, gpus := range allGpus.ByLibrary() { var layerCount int - layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts) + estimate := EstimateGPULayers(gpus, ggml, projectors, opts) + layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize if opts.NumGPU < 0 { if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) { return true, estimatedVRAM @@ -30,24 +32,68 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors return false, estimatedVRAM } +type MemoryEstimate struct { + // How many layers we predict we can load + Layers int + + // The size of the graph which occupies the main GPU + Graph uint64 + + // How much VRAM will be allocated given the number of layers we predict + VRAMSize uint64 + + // The total size of the model if loaded into VRAM. 
If all layers are loaded, VRAMSize == TotalSize + TotalSize uint64 + + // For multi-GPU scenarios, this provides the tensor split parameter + TensorSplit string + + // For multi-GPU scenarios, this is the size in bytes per GPU + GPUSizes []uint64 +} + // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size // The GPUs provided must all be the same Library -func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) { - var memoryAvailable uint64 - for _, info := range gpus { - memoryAvailable += info.FreeMemory - } - if envconfig.MaxVRAM > 0 { - memoryAvailable = envconfig.MaxVRAM - } +func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate { + // Graph size for a partial offload, applies to all GPUs + var graphPartialOffload uint64 - slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable)) + // Graph size when all layers are offloaded, applies to all GPUs + var graphFullOffload uint64 - // TODO - this is probably wrong, first GPU vs secondaries will have different overheads - memoryMinimum := gpus[0].MinimumMemory + // Final graph offload once we know full or partial + var graphOffload uint64 + + // Projectors loaded into GPU0 only + var projectorSize uint64 + + // Conditional output size on GPU 0 + var memoryLayerOutput uint64 + var includeOutput bool + + // One extra layer as a pad for each GPU + var layerBuffer uint64 + + // The sizes of the main layers + var layerSizes []uint64 + + // The sum of all the layer sizes (just for logging) + var memoryWeights uint64 + + // True if all the layers are loaded + var fullyLoaded bool + + // Overflow that didn't fit into the GPU + var overflow uint64 + + availableList := make([]string, len(gpus)) + for i, gpu := range gpus { + availableList[i] = format.HumanBytes2(gpu.FreeMemory) + } + slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList) for _, projector := range projectors { - memoryMinimum += projectorMemoryRequirements(projector) + projectorSize += projectorMemoryRequirements(projector) // multimodal models require at least 2048 context opts.NumCtx = max(opts.NumCtx, 2048) @@ -56,40 +102,28 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts layers := ggml.Tensors().Layers() // add one layer worth of memory as a buffer if blk0, ok := layers["blk.0"]; ok { - memoryMinimum += blk0.size() + layerBuffer = blk0.size() } // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV() - graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch))) + graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch))) if graphPartialOffload == 0 { graphPartialOffload = ggml.KV().GQA() * kv / 6 } - if graphFullOffload == 0 { graphFullOffload = graphPartialOffload } - graphFullOffload *= uint64(len(gpus)) - graphPartialOffload *= uint64(len(gpus)) - // on metal there's no partial offload overhead if gpus[0].Library == "metal" { graphPartialOffload = graphFullOffload } - // memoryRequiredTotal represents the memory required for full GPU offloading (all layers) - 
memoryRequiredTotal := memoryMinimum + graphFullOffload - - // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers) - memoryRequiredPartial := memoryMinimum + graphPartialOffload - - var memoryLayerOutput uint64 if layer, ok := layers["output_norm"]; ok { memoryLayerOutput += layer.size() } - if layer, ok := layers["output"]; ok { memoryLayerOutput += layer.size() } else if layer, ok := layers["token_embd"]; ok { @@ -97,38 +131,144 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts } if gpus[0].Library == "metal" && opts.UseMMap { - // memory is preallocated for output tensors - memoryRequiredTotal += memoryLayerOutput - memoryRequiredPartial += memoryLayerOutput + includeOutput = true + } else if gpus[0].Library != "metal" || !opts.UseMMap { + includeOutput = true } + gpuZeroOverhead := projectorSize + if includeOutput { + gpuZeroOverhead += memoryLayerOutput + } + + // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer var layerCount int + layerCounts := make([]int, len(gpus)) + gpuAllocations := make([]uint64, len(gpus)) + type gs struct { + i int + g *gpu.GpuInfo + } + gpusWithSpace := []gs{} + for i := range gpus { + var gzo uint64 + if len(gpusWithSpace) == 0 { + gzo = gpuZeroOverhead + } + // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer + if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer { + slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i]) + continue + } + gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]}) + gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full + } + + var gpuZeroID int + if len(gpusWithSpace) > 0 { + gpuZeroID = gpusWithSpace[0].i + gpuAllocations[gpuZeroID] += gpuZeroOverhead + } + + layerSizes = make([]uint64, int(ggml.KV().BlockCount())) for i := range int(ggml.KV().BlockCount()) { if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { memoryLayer := blk.size() // KV is proportional to the number of layers memoryLayer += kv / ggml.KV().BlockCount() - - memoryRequiredTotal += memoryLayer - if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) { - memoryRequiredPartial += memoryLayer - layerCount++ - } + layerSizes[i] = memoryLayer + memoryWeights += memoryLayer } } - if gpus[0].Library != "metal" || !opts.UseMMap { - // memory was not preallocated for output tensors - memoryRequiredTotal += memoryLayerOutput + // For all the layers, find where they can fit on the GPU(s) + for i := range layerSizes { + if layerSizes[i] == 0 { + continue + } + if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { + // Stop allocating on GPU(s) once we hit the users target NumGPU + continue + } + + // distribute the layers across the GPU(s) that have space + for j := len(gpusWithSpace); j > 0; j-- { + g := gpusWithSpace[i%j] + used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) + if g.g.FreeMemory > used+layerSizes[i] { + gpuAllocations[g.i] += layerSizes[i] + layerCounts[g.i]++ + layerCount++ + break + } else { + gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) 
+			}
+		}
+
+	}
+	if layerCount >= int(ggml.KV().BlockCount()) {
+		fullyLoaded = true
+	} else {
+		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
+			overflow += layerSizes[i]
+		}
+	}
+	// Find where the output fits
+	if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[layerCount%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > used+memoryLayerOutput {
+				gpuAllocations[g.i] += memoryLayerOutput
+				layerCounts[g.i]++
+				layerCount++
+				break
+			}
+		}
+		if layerCount < int(ggml.KV().BlockCount())+1 {
+			fullyLoaded = false
+			overflow += memoryLayerOutput
+		}
 	}
 
-	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
-		layerCount = int(ggml.KV().BlockCount()) + 1
-		memoryRequiredPartial = memoryRequiredTotal
+	// Add the applicable (full or partial) graph allocations
+	for i := range gpus {
+		if layerCounts[i] <= 0 {
+			continue
+		}
+		if fullyLoaded {
+			gpuAllocations[i] += graphFullOffload
+		} else {
+			gpuAllocations[i] += graphPartialOffload
+		}
+	}
+	if fullyLoaded {
+		graphOffload = graphFullOffload
+	} else {
+		graphOffload = graphPartialOffload
 	}
 
-	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
+	// Summaries for the log
+	var memoryRequiredPartial, memoryRequiredTotal uint64
+	for i := range gpuAllocations {
+		memoryRequiredPartial += gpuAllocations[i]
+
+	}
+	memoryRequiredTotal = memoryRequiredPartial + overflow
+
+	tensorSplit := ""
+	if len(gpus) > 1 {
+		splits := make([]string, len(gpus))
+		for i, count := range layerCounts {
+			splits[i] = strconv.Itoa(count)
+		}
+		tensorSplit = strings.Join(splits, ",")
+	}
+	allocationsList := []string{}
+	for _, a := range gpuAllocations {
+		allocationsList = append(allocationsList, format.HumanBytes2(a))
+	}
 
 	slog.Info(
 		"offload to gpu",
@@ -136,13 +276,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			"layers",
 			// requested number of layers to offload
 			"requested", opts.NumGPU,
+			// The number of layers the model has (including output)
+			"model", int(ggml.KV().BlockCount())+1,
 			// estimated number of layers that can be offloaded
-			"real", layerCount,
+			"offload", layerCount,
+			// multi-gpu split for tensors
+			"split", tensorSplit,
 		),
 		slog.Group(
 			"memory",
-			// memory available for offloading
-			"available", format.HumanBytes2(memoryAvailable),
+			// memory available by GPU for offloading
+			"available", availableList,
 			slog.Group(
 				"required",
 				// memory required for full offloading
 				"full", format.HumanBytes2(memoryRequiredTotal),
 				// memory required for partial offloading
 				"partial", format.HumanBytes2(memoryRequiredPartial),
 				// memory of KV cache
 				"kv", format.HumanBytes2(kv),
+				// Allocations across the GPUs
+				"allocations", allocationsList,
 			),
 			slog.Group(
 				"weights",
@@ -171,12 +317,31 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		),
 	)
 	if gpus[0].Library == "cpu" {
-		return 0, 0, memoryRequiredTotal
+		return MemoryEstimate{
+			Layers: 0,
+			Graph: 0,
+			VRAMSize: 0,
+			TotalSize: memoryRequiredTotal,
+			GPUSizes: []uint64{},
+		}
 	}
-	if memoryRequiredPartial > memoryAvailable {
+	if layerCount == 0 {
 		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0, memoryRequiredTotal
+		return MemoryEstimate{
+			Layers: 0,
+			Graph: 0,
+			VRAMSize: 0,
+			TotalSize: memoryRequiredTotal,
+			GPUSizes: []uint64{},
+		}
 	}
-	return layerCount, memoryRequiredPartial, memoryRequiredTotal
+	return MemoryEstimate{
+		Layers: layerCount,
+		Graph: graphOffload,
+		VRAMSize: memoryRequiredPartial,
+		TotalSize: memoryRequiredTotal,
+		TensorSplit: tensorSplit,
+		GPUSizes: gpuAllocations,
+	}
 }
diff --git a/llm/memory_test.go b/llm/memory_test.go
new file mode 100644
index 00000000..0adbc541
--- /dev/null
+++ b/llm/memory_test.go
@@ -0,0 +1,116 @@
+package llm
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"testing"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/gpu"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestEstimateGPULayers(t *testing.T) {
+	envconfig.Debug = true
+	modelName := "dummy"
+	f, err := os.CreateTemp(t.TempDir(), modelName)
+	assert.Nil(t, err)
+	defer f.Close()
+	gguf := NewGGUFV3(binary.LittleEndian)
+	inputLayerCount := 5
+	tensors := []Tensor{
+		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+	}
+	assert.Equal(t, inputLayerCount+1, len(tensors))
+	err = gguf.Encode(f, KV{
+		"general.architecture": "llama",
+		"general.name": "name",
+		"llama.context_length": uint32(32),
+		"llama.embedding_length": uint32(4096),
+		"llama.block_count": uint32(inputLayerCount),
+		"llama.attention.head_count": uint32(32),
+		"llama.attention.head_count_kv": uint32(32),
+		"tokenizer.ggml.tokens": []string{" "},
+		"tokenizer.ggml.scores": []float32{0},
+		"tokenizer.ggml.token_type": []int32{0},
+	}, tensors)
+	require.NoError(t, err)
+
+	ggml, err := LoadModel(f.Name())
+	require.NoError(t, err)
+
+	// Simple CPU scenario
+	gpus := []gpu.GpuInfo{
+		{
+			Library: "cpu",
+		},
+	}
+	projectors := []string{}
+	opts := api.DefaultOptions()
+	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+	assert.Equal(t, 0, estimate.Layers)
+	assert.Equal(t, uint64(0), estimate.Graph)
+
+	// derived from the dummy ggml file above
+	graphPartialOffload := uint64(202377216)
+	graphFullOffload := uint64(171968512)
+	layerSize := uint64(33554436)
+	projectorSize := uint64(0)
+	memoryLayerOutput := uint64(4)
+
+	// Dual CUDA scenario with asymmetry
+	gpuMinimumMemory := uint64(2048)
+	gpus = []gpu.GpuInfo{
+		{
+			Library: "cuda",
+			MinimumMemory: gpuMinimumMemory,
+		},
+		{
+			Library: "cuda",
+			MinimumMemory: gpuMinimumMemory,
+		},
+	}
+	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
+	for i, s := range [][]uint64{
+		{1, 1, 1, 1},
+		{2, 1, 2, 1},
+		{2, 2, 2, 2},
+		{1, 2, 1, 2},
+		{3, 3, 3, 3},
+		{4, 4, 3, 3},
+		{6, 6, 3, 3},
+		{0, 3, 0, 3},
+	} {
+		gpus[0].FreeMemory = 0
+		gpus[1].FreeMemory = 0
+		gpus[0].FreeMemory += projectorSize + memoryLayerOutput
+		gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
+		gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
+		gpus[0].FreeMemory += 
max(graphFullOffload, graphPartialOffload) + gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload) + estimate = EstimateGPULayers(gpus, ggml, projectors, opts) + assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s) + assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s) + var layerSums uint64 + for _, b := range estimate.GPUSizes { + layerSums += b + } + if estimate.Layers < inputLayerCount+1 { + assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate) + assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate) + } else { + assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate) + assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate) + } + } + +} diff --git a/llm/server.go b/llm/server.go index 0a815798..eb3d6365 100644 --- a/llm/server.go +++ b/llm/server.go @@ -49,13 +49,11 @@ type llmServer struct { status *StatusWriter options api.Options - // TODO - this should be broken down by GPU - estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model - estimatedTotal uint64 // Total size of model - totalLayers uint64 - gpuCount int - loadDuration time.Duration // Record how long it took the model to load - loadProgress float32 + estimate MemoryEstimate + totalLayers uint64 + gpuCount int + loadDuration time.Duration // Record how long it took the model to load + loadProgress float32 sem *semaphore.Weighted } @@ -80,8 +78,7 @@ func LoadModel(model string) (*GGML, error) { func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) { var err error var cpuRunner string - var estimatedVRAM uint64 - var estimatedTotal uint64 + var estimate MemoryEstimate var systemMemory uint64 gpuCount := len(gpus) if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 { @@ -89,7 +86,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr cpuRunner = serverForCpu() gpuCount = 0 - _, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts) + estimate = EstimateGPULayers(gpus, ggml, projectors, opts) } else { if gpus[0].Library == "metal" { memInfo, err := gpu.GetCPUMem() @@ -100,20 +97,19 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr slog.Debug("system memory", "total", format.HumanBytes2(systemMemory)) } } - var layers int - layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts) + estimate = EstimateGPULayers(gpus, ggml, projectors, opts) switch { - case gpus[0].Library == "metal" && estimatedVRAM > systemMemory: + case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory: // disable partial offloading when model is greater than total system memory as this // can lead to locking up the system opts.NumGPU = 0 - case gpus[0].Library != "metal" && layers == 0: + case gpus[0].Library != "metal" && estimate.Layers == 0: // Don't bother loading into the GPU if no layers can fit cpuRunner = serverForCpu() gpuCount = 0 - case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu": - opts.NumGPU = layers + case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu": + opts.NumGPU = estimate.Layers } } @@ -232,6 +228,14 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--parallel", fmt.Sprintf("%d", numParallel)) + if 
estimate.TensorSplit != "" { + params = append(params, "--tensor-split", estimate.TensorSplit) + } + + if estimate.TensorSplit != "" { + params = append(params, "--tensor-split", estimate.TensorSplit) + } + for i := range len(servers) { dir := availableServers[servers[i]] if dir == "" { @@ -299,16 +303,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } s := &llmServer{ - port: port, - cmd: exec.Command(server, finalParams...), - status: NewStatusWriter(os.Stderr), - options: opts, - estimatedVRAM: estimatedVRAM, - estimatedTotal: estimatedTotal, - sem: semaphore.NewWeighted(int64(numParallel)), - totalLayers: ggml.KV().BlockCount() + 1, - gpuCount: gpuCount, - done: make(chan error, 1), + port: port, + cmd: exec.Command(server, finalParams...), + status: NewStatusWriter(os.Stderr), + options: opts, + estimate: estimate, + sem: semaphore.NewWeighted(int64(numParallel)), + totalLayers: ggml.KV().BlockCount() + 1, + gpuCount: gpuCount, + done: make(chan error, 1), } s.cmd.Env = os.Environ() @@ -1004,11 +1007,11 @@ func (s *llmServer) Close() error { } func (s *llmServer) EstimatedVRAM() uint64 { - return s.estimatedVRAM + return s.estimate.VRAMSize } func (s *llmServer) EstimatedTotal() uint64 { - return s.estimatedTotal + return s.estimate.TotalSize } func parseDurationMs(ms float64) time.Duration { diff --git a/server/sched_test.go b/server/sched_test.go index f7dce6d1..e7ea5874 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -129,6 +129,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV "tokenizer.ggml.token_type": []int32{0}, }, []llm.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, + {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, }) require.NoError(t, err)
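
Note on the allocation strategy introduced in llm/memory.go above: free VRAM is no longer treated as one logical block. EstimateGPULayers keeps a per-GPU budget (the GPU minimum memory, the graph, and one layer of padding are reserved), walks the layers in order, and greedily places each whole layer on a GPU that still has room, since a single layer can never be split across GPUs. The resulting per-GPU layer counts become MemoryEstimate.TensorSplit, which llm/server.go passes to the runner as --tensor-split. The standalone Go sketch below illustrates only that placement loop; it is not code from the patch, the names gpuBudget and placeLayers are invented for illustration, and the per-GPU reserves are assumed to have already been subtracted from each budget.

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// gpuBudget is the free VRAM believed to remain on one GPU after the per-GPU
// reserves (minimum memory, graph, one layer of padding) have been taken out.
type gpuBudget struct {
	free uint64
}

// placeLayers walks the layers in order and assigns each one to a GPU that
// still has space, rotating over the remaining candidates much like the patch
// indexes gpusWithSpace. A GPU that cannot fit the current layer is dropped
// from the candidate list. It returns the per-GPU layer counts and the number
// of layers placed; anything left over would overflow to the CPU.
func placeLayers(budgets []gpuBudget, layerSizes []uint64) (counts []int, placed int) {
	counts = make([]int, len(budgets))
	withSpace := make([]int, 0, len(budgets)) // indexes of GPUs that still have room
	for i := range budgets {
		withSpace = append(withSpace, i)
	}
	for li, size := range layerSizes {
		for len(withSpace) > 0 {
			j := li % len(withSpace)
			g := withSpace[j]
			if budgets[g].free >= size {
				budgets[g].free -= size
				counts[g]++
				placed++
				break
			}
			// This GPU is out of room: remove it and retry the others.
			withSpace = append(withSpace[:j], withSpace[j+1:]...)
		}
	}
	return counts, placed
}

func main() {
	// Two GPUs with asymmetric remaining budgets and five equally sized layers.
	budgets := []gpuBudget{{free: 300}, {free: 200}}
	layers := []uint64{100, 100, 100, 100, 100}

	counts, placed := placeLayers(budgets, layers)

	// The per-GPU counts are what ends up in the --tensor-split argument.
	splits := make([]string, len(counts))
	for i, c := range counts {
		splits[i] = strconv.Itoa(c)
	}
	fmt.Printf("placed %d/%d layers, tensor split %s\n", placed, len(layers), strings.Join(splits, ","))
}

Running the sketch prints "placed 5/5 layers, tensor split 3,2"; the asymmetric dual-GPU scenarios in the new llm/memory_test.go exercise the real implementation in the same spirit.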