From 6fd04ca922e5da7ef8c52d86118fc58b798a7e4a Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Sat, 18 May 2024 12:34:31 -0700
Subject: [PATCH] Improve multi-gpu handling at the limit

Still not complete. Our prediction needs some refinement to understand
each discrete GPU's available space so we can see how many layers fit
on each one; since we can't split a single layer across multiple GPUs,
we can't treat free space as one logical block.
---
 gpu/amd_linux.go                |   2 +-
 gpu/gpu.go                      |   4 -
 gpu/types.go                    |   6 +-
 integration/concurrency_test.go |  20 ++-
 integration/context_test.go     |   2 +-
 integration/utils_test.go       |   2 +-
 llm/ggml.go                     |   1 +
 llm/memory.go                   | 267 ++++++++++++++++++++++++++------
 llm/memory_test.go              | 116 ++++++++++++++
 llm/server.go                   |  59 +++----
 server/sched_test.go            |   1 +
 11 files changed, 390 insertions(+), 90 deletions(-)
 create mode 100644 llm/memory_test.go

diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go
index 5c45165f..97c8274f 100644
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -27,7 +27,7 @@ const (
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 
 	// Direct Rendering Manager sysfs location
-	DRMDeviceDirGlob = "/sys/class/drm/card[0-9]/device"
+	DRMDeviceDirGlob = "/sys/class/drm/card*/device"
 	DRMTotalMemoryFile = "mem_info_vram_total"
 	DRMUsedMemoryFile = "mem_info_vram_used"
 
diff --git a/gpu/gpu.go b/gpu/gpu.go
index 5dcab592..1832667b 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -246,10 +246,6 @@ func GetGPUInfo() GpuInfoList {
 		return GpuInfoList{cpus[0].GpuInfo}
 	}
 
-	// TODO - implement
-
-	// TODO refine the discovery to only gather total memory
-
 	// On windows we bundle the nvidia library one level above the runner dir
 	depPath := ""
 	if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {
diff --git a/gpu/types.go b/gpu/types.go
index c712af90..a633e6c7 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -44,14 +44,14 @@ type CPUInfo struct {
 
 type CudaGPUInfo struct {
 	GpuInfo
-	index int // device index
+	index int // nolint: unused
 }
 type CudaGPUInfoList []CudaGPUInfo
 
 type RocmGPUInfo struct {
 	GpuInfo
-	usedFilepath string // linux
-	index int // device index on windows
+	usedFilepath string // nolint: unused
+	index int // nolint: unused
 }
 type RocmGPUInfoList []RocmGPUInfo
 
diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go
index 1f341fc3..e4bc872c 100644
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -38,7 +38,7 @@ func TestMultiModelConcurrency(t *testing.T) {
 		}
 		resp = [2][]string{
 			[]string{"sunlight"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
+			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
 		}
 	)
 	var wg sync.WaitGroup
@@ -229,5 +229,23 @@ func TestMultiModelStress(t *testing.T) {
 			}
 		}(i)
 	}
+	go func() {
+		for {
+			time.Sleep(2 * time.Second)
+			select {
+			case <-ctx.Done():
+				return
+			default:
+				models, err := client.ListRunning(ctx)
+				if err != nil {
+					slog.Warn("failed to list running models", "error", err)
+					continue
+				}
+				for _, m := range models.Models {
+					slog.Info("loaded model snapshot", "model", m)
+				}
+			}
+		}
+	}()
 	wg.Wait()
 }
diff --git a/integration/context_test.go b/integration/context_test.go
index 08033125..75efb435 100644
--- a/integration/context_test.go
+++ b/integration/context_test.go
@@ -11,7 +11,7 @@ import (
 )
 
 func TestContextExhaustion(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
+ ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute) // Longer needed for small footprint GPUs defer cancel() // Set up the test data req := api.GenerateRequest{ diff --git a/integration/utils_test.go b/integration/utils_test.go index c6f19e98..5da6fc72 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -331,7 +331,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) { [][]string{ []string{"sunlight"}, []string{"soil", "organic", "earth", "black", "tan"}, - []string{"england", "english", "massachusetts", "pilgrims"}, + []string{"england", "english", "massachusetts", "pilgrims", "british"}, []string{"fourth", "july", "declaration", "independence"}, []string{"nitrogen", "oxygen", "carbon", "dioxide"}, } diff --git a/llm/ggml.go b/llm/ggml.go index 645447d5..35b89d16 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -307,6 +307,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui partialOffload = 4 * batch * embedding partialOffload += max( + // 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()), 4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV), 4*batch*(embedding+vocab)+embedding*vocab*105/128, ) diff --git a/llm/memory.go b/llm/memory.go index 1c2e476b..6f830cb1 100644 --- a/llm/memory.go +++ b/llm/memory.go @@ -3,9 +3,10 @@ package llm import ( "fmt" "log/slog" + "strconv" + "strings" "github.com/ollama/ollama/api" - "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/gpu" ) @@ -16,7 +17,8 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors var estimatedVRAM uint64 for _, gpus := range allGpus.ByLibrary() { var layerCount int - layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts) + estimate := EstimateGPULayers(gpus, ggml, projectors, opts) + layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize if opts.NumGPU < 0 { if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) { return true, estimatedVRAM @@ -30,24 +32,68 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors return false, estimatedVRAM } +type MemoryEstimate struct { + // How many layers we predict we can load + Layers int + + // The size of the graph which occupies the main GPU + Graph uint64 + + // How much VRAM will be allocated given the number of layers we predict + VRAMSize uint64 + + // The total size of the model if loaded into VRAM. 
If all layers are loaded, VRAMSize == TotalSize + TotalSize uint64 + + // For multi-GPU scenarios, this provides the tensor split parameter + TensorSplit string + + // For multi-GPU scenarios, this is the size in bytes per GPU + GPUSizes []uint64 +} + // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size // The GPUs provided must all be the same Library -func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) { - var memoryAvailable uint64 - for _, info := range gpus { - memoryAvailable += info.FreeMemory - } - if envconfig.MaxVRAM > 0 { - memoryAvailable = envconfig.MaxVRAM - } +func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate { + // Graph size for a partial offload, applies to all GPUs + var graphPartialOffload uint64 - slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable)) + // Graph size when all layers are offloaded, applies to all GPUs + var graphFullOffload uint64 - // TODO - this is probably wrong, first GPU vs secondaries will have different overheads - memoryMinimum := gpus[0].MinimumMemory + // Final graph offload once we know full or partial + var graphOffload uint64 + + // Projectors loaded into GPU0 only + var projectorSize uint64 + + // Conditional output size on GPU 0 + var memoryLayerOutput uint64 + var includeOutput bool + + // One extra layer as a pad for each GPU + var layerBuffer uint64 + + // The sizes of the main layers + var layerSizes []uint64 + + // The sum of all the layer sizes (just for logging) + var memoryWeights uint64 + + // True if all the layers are loaded + var fullyLoaded bool + + // Overflow that didn't fit into the GPU + var overflow uint64 + + availableList := make([]string, len(gpus)) + for i, gpu := range gpus { + availableList[i] = format.HumanBytes2(gpu.FreeMemory) + } + slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList) for _, projector := range projectors { - memoryMinimum += projectorMemoryRequirements(projector) + projectorSize += projectorMemoryRequirements(projector) // multimodal models require at least 2048 context opts.NumCtx = max(opts.NumCtx, 2048) @@ -56,40 +102,28 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts layers := ggml.Tensors().Layers() // add one layer worth of memory as a buffer if blk0, ok := layers["blk.0"]; ok { - memoryMinimum += blk0.size() + layerBuffer = blk0.size() } // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV() - graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch))) + graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch))) if graphPartialOffload == 0 { graphPartialOffload = ggml.KV().GQA() * kv / 6 } - if graphFullOffload == 0 { graphFullOffload = graphPartialOffload } - graphFullOffload *= uint64(len(gpus)) - graphPartialOffload *= uint64(len(gpus)) - // on metal there's no partial offload overhead if gpus[0].Library == "metal" { graphPartialOffload = graphFullOffload } - // memoryRequiredTotal represents the memory required for full GPU offloading (all layers) - 
memoryRequiredTotal := memoryMinimum + graphFullOffload - - // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers) - memoryRequiredPartial := memoryMinimum + graphPartialOffload - - var memoryLayerOutput uint64 if layer, ok := layers["output_norm"]; ok { memoryLayerOutput += layer.size() } - if layer, ok := layers["output"]; ok { memoryLayerOutput += layer.size() } else if layer, ok := layers["token_embd"]; ok { @@ -97,38 +131,144 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts } if gpus[0].Library == "metal" && opts.UseMMap { - // memory is preallocated for output tensors - memoryRequiredTotal += memoryLayerOutput - memoryRequiredPartial += memoryLayerOutput + includeOutput = true + } else if gpus[0].Library != "metal" || !opts.UseMMap { + includeOutput = true } + gpuZeroOverhead := projectorSize + if includeOutput { + gpuZeroOverhead += memoryLayerOutput + } + + // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer var layerCount int + layerCounts := make([]int, len(gpus)) + gpuAllocations := make([]uint64, len(gpus)) + type gs struct { + i int + g *gpu.GpuInfo + } + gpusWithSpace := []gs{} + for i := range gpus { + var gzo uint64 + if len(gpusWithSpace) == 0 { + gzo = gpuZeroOverhead + } + // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer + if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer { + slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i]) + continue + } + gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]}) + gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full + } + + var gpuZeroID int + if len(gpusWithSpace) > 0 { + gpuZeroID = gpusWithSpace[0].i + gpuAllocations[gpuZeroID] += gpuZeroOverhead + } + + layerSizes = make([]uint64, int(ggml.KV().BlockCount())) for i := range int(ggml.KV().BlockCount()) { if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok { memoryLayer := blk.size() // KV is proportional to the number of layers memoryLayer += kv / ggml.KV().BlockCount() - - memoryRequiredTotal += memoryLayer - if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) { - memoryRequiredPartial += memoryLayer - layerCount++ - } + layerSizes[i] = memoryLayer + memoryWeights += memoryLayer } } - if gpus[0].Library != "metal" || !opts.UseMMap { - // memory was not preallocated for output tensors - memoryRequiredTotal += memoryLayerOutput + // For all the layers, find where they can fit on the GPU(s) + for i := range layerSizes { + if layerSizes[i] == 0 { + continue + } + if opts.NumGPU >= 0 && layerCount >= opts.NumGPU { + // Stop allocating on GPU(s) once we hit the users target NumGPU + continue + } + + // distribute the layers across the GPU(s) that have space + for j := len(gpusWithSpace); j > 0; j-- { + g := gpusWithSpace[i%j] + used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) + if g.g.FreeMemory > used+layerSizes[i] { + gpuAllocations[g.i] += layerSizes[i] + layerCounts[g.i]++ + layerCount++ + break + } else { + gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...) 
+			}
+		}
+
+	}
+	if layerCount >= int(ggml.KV().BlockCount()) {
+		fullyLoaded = true
+	} else {
+		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
+			overflow += layerSizes[i]
+		}
+	}
+	// Find where the output fits
+	if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[layerCount%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > used+memoryLayerOutput {
+				gpuAllocations[g.i] += memoryLayerOutput
+				layerCounts[g.i]++
+				layerCount++
+				break
+			}
+		}
+		if layerCount < int(ggml.KV().BlockCount())+1 {
+			fullyLoaded = false
+			overflow += memoryLayerOutput
+		}
 	}
 
-	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
-		layerCount = int(ggml.KV().BlockCount()) + 1
-		memoryRequiredPartial = memoryRequiredTotal
+	// Add the applicable (full or partial) graph allocations
+	for i := range gpus {
+		if layerCounts[i] <= 0 {
+			continue
+		}
+		if fullyLoaded {
+			gpuAllocations[i] += graphFullOffload
+		} else {
+			gpuAllocations[i] += graphPartialOffload
+		}
+	}
+	if fullyLoaded {
+		graphOffload = graphFullOffload
+	} else {
+		graphOffload = graphPartialOffload
 	}
 
-	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
+	// Summaries for the log
+	var memoryRequiredPartial, memoryRequiredTotal uint64
+	for i := range gpuAllocations {
+		memoryRequiredPartial += gpuAllocations[i]
+
+	}
+	memoryRequiredTotal = memoryRequiredPartial + overflow
+
+	tensorSplit := ""
+	if len(gpus) > 1 {
+		splits := make([]string, len(gpus))
+		for i, count := range layerCounts {
+			splits[i] = strconv.Itoa(count)
+		}
+		tensorSplit = strings.Join(splits, ",")
+	}
+	allocationsList := []string{}
+	for _, a := range gpuAllocations {
+		allocationsList = append(allocationsList, format.HumanBytes2(a))
+	}
 
 	slog.Info(
 		"offload to gpu",
@@ -136,13 +276,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			"layers",
 			// requested number of layers to offload
 			"requested", opts.NumGPU,
+			// The number of layers the model has (including output)
+			"model", int(ggml.KV().BlockCount())+1,
 			// estimated number of layers that can be offloaded
-			"real", layerCount,
+			"offload", layerCount,
+			// multi-gpu split for tensors
+			"split", tensorSplit,
 		),
 		slog.Group(
 			"memory",
-			// memory available for offloading
-			"available", format.HumanBytes2(memoryAvailable),
+			// memory available by GPU for offloading
+			"available", availableList,
 			slog.Group(
 				"required",
 				// memory required for full offloading
 				"full", format.HumanBytes2(memoryRequiredTotal),
 				// memory required for partial offloading
 				"partial", format.HumanBytes2(memoryRequiredPartial),
 				// memory of KV cache
 				"kv", format.HumanBytes2(kv),
+				// Allocations across the GPUs
+				"allocations", allocationsList,
 			),
 			slog.Group(
 				"weights",
@@ -171,12 +317,31 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		),
 	)
 	if gpus[0].Library == "cpu" {
-		return 0, 0, memoryRequiredTotal
+		return MemoryEstimate{
+			Layers: 0,
+			Graph: 0,
+			VRAMSize: 0,
+			TotalSize: memoryRequiredTotal,
+			GPUSizes: []uint64{},
+		}
 	}
-	if memoryRequiredPartial > memoryAvailable {
+	if layerCount == 0 {
 		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0, memoryRequiredTotal
+		return MemoryEstimate{
+			Layers: 0,
+			Graph: 0,
+			VRAMSize: 0,
+			TotalSize: memoryRequiredTotal,
+			GPUSizes: []uint64{},
+		}
 	}
-	return layerCount, memoryRequiredPartial, memoryRequiredTotal
+	return MemoryEstimate{
+		Layers: layerCount,
+		Graph: graphOffload,
+		VRAMSize: memoryRequiredPartial,
+		TotalSize: memoryRequiredTotal,
+		TensorSplit: tensorSplit,
+		GPUSizes: gpuAllocations,
+	}
 }
diff --git a/llm/memory_test.go b/llm/memory_test.go
new file mode 100644
index 00000000..0adbc541
--- /dev/null
+++ b/llm/memory_test.go
@@ -0,0 +1,116 @@
+package llm
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"testing"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/gpu"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestEstimateGPULayers(t *testing.T) {
+	envconfig.Debug = true
+	modelName := "dummy"
+	f, err := os.CreateTemp(t.TempDir(), modelName)
+	assert.Nil(t, err)
+	defer f.Close()
+	gguf := NewGGUFV3(binary.LittleEndian)
+	inputLayerCount := 5
+	tensors := []Tensor{
+		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+	}
+	assert.Equal(t, inputLayerCount+1, len(tensors))
+	err = gguf.Encode(f, KV{
+		"general.architecture": "llama",
+		"general.name": "name",
+		"llama.context_length": uint32(32),
+		"llama.embedding_length": uint32(4096),
+		"llama.block_count": uint32(inputLayerCount),
+		"llama.attention.head_count": uint32(32),
+		"llama.attention.head_count_kv": uint32(32),
+		"tokenizer.ggml.tokens": []string{" "},
+		"tokenizer.ggml.scores": []float32{0},
+		"tokenizer.ggml.token_type": []int32{0},
+	}, tensors)
+	require.NoError(t, err)
+
+	ggml, err := LoadModel(f.Name())
+	require.NoError(t, err)
+
+	// Simple CPU scenario
+	gpus := []gpu.GpuInfo{
+		{
+			Library: "cpu",
+		},
+	}
+	projectors := []string{}
+	opts := api.DefaultOptions()
+	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+	assert.Equal(t, 0, estimate.Layers)
+	assert.Equal(t, uint64(0), estimate.Graph)
+
+	// derived from the dummy ggml file above
+	graphPartialOffload := uint64(202377216)
+	graphFullOffload := uint64(171968512)
+	layerSize := uint64(33554436)
+	projectorSize := uint64(0)
+	memoryLayerOutput := uint64(4)
+
+	// Dual CUDA scenario with asymmetry
+	gpuMinimumMemory := uint64(2048)
+	gpus = []gpu.GpuInfo{
+		{
+			Library: "cuda",
+			MinimumMemory: gpuMinimumMemory,
+		},
+		{
+			Library: "cuda",
+			MinimumMemory: gpuMinimumMemory,
+		},
+	}
+	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
+	for i, s := range [][]uint64{
+		{1, 1, 1, 1},
+		{2, 1, 2, 1},
+		{2, 2, 2, 2},
+		{1, 2, 1, 2},
+		{3, 3, 3, 3},
+		{4, 4, 3, 3},
+		{6, 6, 3, 3},
+		{0, 3, 0, 3},
+	} {
+		gpus[0].FreeMemory = 0
+		gpus[1].FreeMemory = 0
+		gpus[0].FreeMemory += projectorSize + memoryLayerOutput
+		gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
+		gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
+		gpus[0].FreeMemory += 
max(graphFullOffload, graphPartialOffload) + gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload) + estimate = EstimateGPULayers(gpus, ggml, projectors, opts) + assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s) + assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s) + var layerSums uint64 + for _, b := range estimate.GPUSizes { + layerSums += b + } + if estimate.Layers < inputLayerCount+1 { + assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate) + assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate) + } else { + assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate) + assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate) + } + } + +} diff --git a/llm/server.go b/llm/server.go index 0a815798..eb3d6365 100644 --- a/llm/server.go +++ b/llm/server.go @@ -49,13 +49,11 @@ type llmServer struct { status *StatusWriter options api.Options - // TODO - this should be broken down by GPU - estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model - estimatedTotal uint64 // Total size of model - totalLayers uint64 - gpuCount int - loadDuration time.Duration // Record how long it took the model to load - loadProgress float32 + estimate MemoryEstimate + totalLayers uint64 + gpuCount int + loadDuration time.Duration // Record how long it took the model to load + loadProgress float32 sem *semaphore.Weighted } @@ -80,8 +78,7 @@ func LoadModel(model string) (*GGML, error) { func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) { var err error var cpuRunner string - var estimatedVRAM uint64 - var estimatedTotal uint64 + var estimate MemoryEstimate var systemMemory uint64 gpuCount := len(gpus) if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 { @@ -89,7 +86,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr cpuRunner = serverForCpu() gpuCount = 0 - _, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts) + estimate = EstimateGPULayers(gpus, ggml, projectors, opts) } else { if gpus[0].Library == "metal" { memInfo, err := gpu.GetCPUMem() @@ -100,20 +97,19 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr slog.Debug("system memory", "total", format.HumanBytes2(systemMemory)) } } - var layers int - layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts) + estimate = EstimateGPULayers(gpus, ggml, projectors, opts) switch { - case gpus[0].Library == "metal" && estimatedVRAM > systemMemory: + case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory: // disable partial offloading when model is greater than total system memory as this // can lead to locking up the system opts.NumGPU = 0 - case gpus[0].Library != "metal" && layers == 0: + case gpus[0].Library != "metal" && estimate.Layers == 0: // Don't bother loading into the GPU if no layers can fit cpuRunner = serverForCpu() gpuCount = 0 - case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu": - opts.NumGPU = layers + case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu": + opts.NumGPU = estimate.Layers } } @@ -232,6 +228,14 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--parallel", fmt.Sprintf("%d", numParallel)) + if 
estimate.TensorSplit != "" { + params = append(params, "--tensor-split", estimate.TensorSplit) + } + + if estimate.TensorSplit != "" { + params = append(params, "--tensor-split", estimate.TensorSplit) + } + for i := range len(servers) { dir := availableServers[servers[i]] if dir == "" { @@ -299,16 +303,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } s := &llmServer{ - port: port, - cmd: exec.Command(server, finalParams...), - status: NewStatusWriter(os.Stderr), - options: opts, - estimatedVRAM: estimatedVRAM, - estimatedTotal: estimatedTotal, - sem: semaphore.NewWeighted(int64(numParallel)), - totalLayers: ggml.KV().BlockCount() + 1, - gpuCount: gpuCount, - done: make(chan error, 1), + port: port, + cmd: exec.Command(server, finalParams...), + status: NewStatusWriter(os.Stderr), + options: opts, + estimate: estimate, + sem: semaphore.NewWeighted(int64(numParallel)), + totalLayers: ggml.KV().BlockCount() + 1, + gpuCount: gpuCount, + done: make(chan error, 1), } s.cmd.Env = os.Environ() @@ -1004,11 +1007,11 @@ func (s *llmServer) Close() error { } func (s *llmServer) EstimatedVRAM() uint64 { - return s.estimatedVRAM + return s.estimate.VRAMSize } func (s *llmServer) EstimatedTotal() uint64 { - return s.estimatedTotal + return s.estimate.TotalSize } func parseDurationMs(ms float64) time.Duration { diff --git a/server/sched_test.go b/server/sched_test.go index f7dce6d1..e7ea5874 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -129,6 +129,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV "tokenizer.ggml.token_type": []int32{0}, }, []llm.Tensor{ {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, + {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}}, }) require.NoError(t, err)
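
Note on the allocation strategy introduced in llm/memory.go above: free VRAM is no longer treated as one logical block. EstimateGPULayers keeps a per-GPU budget (the GPU minimum memory, the graph, and one layer of padding are reserved), walks the layers in order, and greedily places each whole layer on a GPU that still has room, since a single layer can never be split across GPUs. The resulting per-GPU layer counts become MemoryEstimate.TensorSplit, which llm/server.go passes to the runner as --tensor-split. The standalone Go sketch below illustrates only that placement loop; it is not code from the patch, the names gpuBudget and placeLayers are invented for illustration, and the per-GPU reserves are assumed to have already been subtracted from each budget.

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// gpuBudget is the free VRAM believed to remain on one GPU after the per-GPU
// reserves (minimum memory, graph, one layer of padding) have been taken out.
type gpuBudget struct {
	free uint64
}

// placeLayers walks the layers in order and assigns each one to a GPU that
// still has space, rotating over the remaining candidates much like the patch
// indexes gpusWithSpace. A GPU that cannot fit the current layer is dropped
// from the candidate list. It returns the per-GPU layer counts and the number
// of layers placed; anything left over would overflow to the CPU.
func placeLayers(budgets []gpuBudget, layerSizes []uint64) (counts []int, placed int) {
	counts = make([]int, len(budgets))
	withSpace := make([]int, 0, len(budgets)) // indexes of GPUs that still have room
	for i := range budgets {
		withSpace = append(withSpace, i)
	}
	for li, size := range layerSizes {
		for len(withSpace) > 0 {
			j := li % len(withSpace)
			g := withSpace[j]
			if budgets[g].free >= size {
				budgets[g].free -= size
				counts[g]++
				placed++
				break
			}
			// This GPU is out of room: remove it and retry the others.
			withSpace = append(withSpace[:j], withSpace[j+1:]...)
		}
	}
	return counts, placed
}

func main() {
	// Two GPUs with asymmetric remaining budgets and five equally sized layers.
	budgets := []gpuBudget{{free: 300}, {free: 200}}
	layers := []uint64{100, 100, 100, 100, 100}

	counts, placed := placeLayers(budgets, layers)

	// The per-GPU counts are what ends up in the --tensor-split argument.
	splits := make([]string, len(counts))
	for i, c := range counts {
		splits[i] = strconv.Itoa(c)
	}
	fmt.Printf("placed %d/%d layers, tensor split %s\n", placed, len(layers), strings.Join(splits, ","))
}

Running the sketch prints "placed 5/5 layers, tensor split 3,2"; the asymmetric dual-GPU scenarios in the new llm/memory_test.go exercise the real implementation in the same spirit.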