diff --git a/format/bytes.go b/format/bytes.go
index a0b8008d..f4bcc8c5 100644
--- a/format/bytes.go
+++ b/format/bytes.go
@@ -50,7 +50,7 @@ func HumanBytes(b int64) string {
 	}
 }
 
-func HumanBytes2(b int64) string {
+func HumanBytes2(b uint64) string {
 	switch {
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
diff --git a/gpu/gpu.go b/gpu/gpu.go
index 708fad10..47d70ed0 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -243,7 +243,7 @@ func getCPUMem() (memInfo, error) {
 	return ret, nil
 }
 
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -251,11 +251,11 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return avail, nil
+		return uint64(avail), nil
 	}
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		return int64(gpuInfo.FreeMemory), nil
+		return gpuInfo.FreeMemory, nil
 	}
 
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index f2856e29..824f43bf 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -17,7 +17,7 @@ import (
 )
 
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -25,15 +25,14 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return avail, nil
+		return uint64(avail), nil
 	}
 
 	if runtime.GOARCH == "amd64" {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
-	recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
-	return recommendedMaxVRAM, nil
+	return uint64(C.getRecommendedMaxVRAM()), nil
 }
 
 func GetGPUInfo() GpuInfo {
diff --git a/gpu/types.go b/gpu/types.go
index 6191e196..7fe6c40c 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -15,7 +15,7 @@ type GpuInfo struct {
 	Variant string `json:"variant,omitempty"`
 
 	// MinimumMemory represents the minimum memory required to use the GPU
-	MinimumMemory int64 `json:"-"`
+	MinimumMemory uint64 `json:"-"`
 
 	// TODO add other useful attributes about the card here for discovery information
 }
diff --git a/llm/ggla.go b/llm/ggla.go
index 591d5e00..cf14f214 100644
--- a/llm/ggla.go
+++ b/llm/ggla.go
@@ -49,7 +49,7 @@ func (llm *ggla) KV() KV {
 	return llm.kv
 }
 
-func (llm *ggla) Tensors() []*Tensor {
+func (llm *ggla) Tensors() Tensors {
 	return llm.tensors
 }
 
diff --git a/llm/ggml.go b/llm/ggml.go
index 071a36c3..ff9380f1 100644
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -13,16 +13,6 @@ type GGML struct {
 	model
 }
 
-func (ggml *GGML) LayerSize(prefix string) (n int64) {
-	for _, t := range ggml.Tensors() {
-		if strings.HasPrefix(t.Name, prefix) {
-			n += int64(t.size())
-		}
-	}
-
-	return
-}
-
 const (
 	fileTypeF32 uint32 = iota
 	fileTypeF16
@@ -101,7 +91,7 @@ func fileType(fileType uint32) string {
 
 type model interface {
 	KV() KV
-	Tensors() []*Tensor
+	Tensors() Tensors
 }
 
 type KV map[string]any
@@ -167,6 +157,36 @@ func (kv KV) ContextLength() uint64 {
 	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
 }
 
+type Tensors []*Tensor
+
+func (ts Tensors) Layers() map[string]Layer {
+	layers := make(map[string]Layer)
+	for _, t := range ts {
+		parts := strings.Split(t.Name, ".")
+		if parts[0] == "blk" {
+			parts = parts[1:]
+		}
+
+		if _, ok := layers[parts[0]]; !ok {
+			layers[parts[0]] = make(Layer)
+		}
+
+		layers[parts[0]][strings.Join(parts[1:], ".")] = t
+	}
+
+	return layers
+}
+
+type Layer map[string]*Tensor
+
+func (l Layer) size() (size uint64) {
+	for _, t := range l {
+		size += t.size()
+	}
+
+	return size
+}
+
 type Tensor struct {
 	Name string `json:"name"`
 	Kind uint32 `json:"kind"`
@@ -304,49 +324,52 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch int) (int64, bool) {
-	embeddingLength := llm.KV().EmbeddingLength()
-	headCount := llm.KV().HeadCount()
-	headCountKV := llm.KV().HeadCountKV()
-	vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))
-
-	var attnQKVWeight1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.HasSuffix(t.Name, ".attn_qkv.weight") && len(t.Shape) >= 2 {
-			attnQKVWeight1 = t.Shape[1]
-			break
-		}
-	}
-
-	var ffnGate1 uint64 = 0
-	for _, t := range llm.Tensors() {
-		if strings.Index(t.Name, ".ffn_gate") > 0 && len(t.Shape) >= 2 {
-			ffnGate1 = t.Shape[1]
-			break
-		}
-	}
+func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+	embedding := llm.KV().EmbeddingLength()
+	heads := llm.KV().HeadCount()
+	headsKV := llm.KV().HeadCountKV()
+	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
 
 	switch llm.KV().Architecture() {
-	case "gemma", "command-r":
-		return 4 * int64(batch) * int64(embeddingLength+uint64(vocabLength)), true
-	case "phi2":
-		return max(
-			4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-			4*int64(batch)*int64(1+4*embeddingLength+uint64(context)+attnQKVWeight1+uint64(context)*headCount),
-		), true
-	case "qwen2":
-		return max(
-			4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-			4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
-		), true
 	case "llama":
-		if ffnGate1 > 0 {
-			// moe
-			return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate1), true
-		}
-
-		return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
+		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
+
+		partialOffload = 4 * batch * embedding
+		partialOffload += max(
+			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+		)
+	case "gemma":
+		fullOffload = 4 * batch * (embedding + vocab)
+		partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
+	case "command-r":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(2+4*embedding+context*(1+heads)),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
+		)
+	case "qwen2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+2*embedding+context+context*heads),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
+		)
+	case "phi2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+4*embedding+context+context*heads),
+		)
+
+		partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
 	}
 
-	return 0, false
+	return
 }
diff --git a/llm/gguf.go b/llm/gguf.go
index 7d804712..796642e3 100644
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -109,7 +109,7 @@ func (llm *gguf) KV() KV {
 	return llm.kv
 }
 
-func (llm *gguf) Tensors() []*Tensor {
+func (llm *gguf) Tensors() Tensors {
 	return llm.tensors
 }
 
diff --git a/llm/server.go b/llm/server.go
index 160effe7..707f0b8b 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -41,10 +41,6 @@ var cpuOnlyFamilies = []string{
 }
 
 func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
-	if _, err := os.Stat(model); err != nil {
-		return nil, err
-	}
-
 	f, err := os.Open(model)
 	if err != nil {
 		return nil, err
 	}
@@ -65,67 +61,79 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		opts.NumCtx = 4
 	}
 
-	availableMemory, _ := gpu.CheckVRAM()
+	memoryAvailable, _ := gpu.CheckVRAM()
 	info := gpu.GetGPUInfo()
 
-	usedMemory := info.MinimumMemory
+	memoryMinimum := info.MinimumMemory
 	for _, projector := range projectors {
-		usedMemory += projectorMemoryRequirements(projector)
+		memoryMinimum += projectorMemoryRequirements(projector)
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
+	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
-	graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
-	if !ok {
-		graph = int64(ggml.KV().GQA()) * kv / 6
+	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	if graphPartialOffload == 0 {
+		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
 
-	usedMemory += graph
-
-	if (usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture())) && info.Library != "metal" {
-		info.Library = "cpu"
+	if graphFullOffload == 0 {
+		graphFullOffload = graphPartialOffload
 	}
 
-	requiredMemory := usedMemory
+	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
+	memoryRequiredTotal := memoryMinimum + graphFullOffload
 
-	var layers int
-	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
-		requiredMemory += layerMemory
+	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload
 
-		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
-			usedMemory += layerMemory
-			layers++
+	if info.Library != "metal" {
+		if memoryRequiredPartial > memoryAvailable || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+			info.Library = "cpu"
 		}
 	}
 
-	memOutputLayer := ggml.LayerSize("output.")
-	requiredMemory += memOutputLayer
+	var layerCount int
+	layers := ggml.Tensors().Layers()
+	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
+		memoryLayer := layers[fmt.Sprintf("%d", i)].size()
 
-	// only offload output layer if all repeating layers are offloaded
-	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
-		usedMemory += memOutputLayer
-		layers++
+		// KV is proportional to the number of layers
+		memoryLayer += kv / ggml.KV().BlockCount()
+
+		memoryRequiredTotal += memoryLayer
+		if memoryAvailable > memoryRequiredPartial+memoryLayer {
+			memoryRequiredPartial += memoryLayer
+			layerCount++
+		}
+	}
+
+	memoryLayerOutput := layers["output"].size()
+	memoryRequiredTotal += memoryLayerOutput
+	if memoryAvailable > memoryRequiredTotal {
+		layerCount = int(ggml.KV().BlockCount()) + 1
+		memoryRequiredPartial = memoryRequiredTotal
+	}
+
+	if opts.NumGPU < 0 {
+		opts.NumGPU = layerCount
 	}
 
 	slog.Info(
 		"offload to gpu",
-		"layers", layers,
-		"required", format.HumanBytes2(requiredMemory),
-		"used", format.HumanBytes2(usedMemory),
-		"available", format.HumanBytes2(availableMemory),
+		"reallayers", opts.NumGPU,
+		"layers", layerCount,
+		"required", format.HumanBytes2(memoryRequiredTotal),
+		"used", format.HumanBytes2(memoryRequiredPartial),
+		"available", format.HumanBytes2(memoryAvailable),
 		"kv", format.HumanBytes2(kv),
-		"graph", format.HumanBytes2(graph),
+		"fulloffload", format.HumanBytes2(graphFullOffload),
+		"partialoffload", format.HumanBytes2(graphPartialOffload),
 	)
 
-	if opts.NumGPU < 0 && info.Library != "cpu" {
-		opts.NumGPU = layers
-	}
-
 	if len(adapters) > 1 {
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 	}
@@ -282,7 +290,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 	return nil, finalErr
 }
 
-func projectorMemoryRequirements(filename string) int64 {
+func projectorMemoryRequirements(filename string) uint64 {
 	file, err := os.Open(filename)
 	if err != nil {
 		return 0
@@ -294,18 +302,12 @@ func projectorMemoryRequirements(filename string) int64 {
 		return 0
 	}
 
-	prefixes := make(map[string]struct{})
-	for _, layer := range ggml.Tensors() {
-		parts := strings.Split(layer.Name, ".")
-		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
+	var mem uint64
+	for _, layer := range ggml.Tensors().Layers() {
+		mem += layer.size()
 	}
 
-	var ask int64
-	for prefix := range prefixes {
-		ask += ggml.LayerSize(prefix)
-	}
-
-	return ask
+	return mem
 }
 
 type ServerStatus int
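
Illustrative sketch (not part of the diff): a minimal, self-contained Go program mirroring the new Tensors.Layers() grouping and Layer.size() accounting that the per-layer loop in llm/server.go relies on. The Tensor type here is trimmed to a stubbed byte count (the real one derives its size from Kind and Shape), and the tensor names and sizes are invented for the example.

package main

import (
	"fmt"
	"strings"
)

// Trimmed stand-in for llm.Tensor: size is stubbed instead of being
// computed from Kind and Shape.
type Tensor struct {
	Name  string
	bytes uint64
}

func (t Tensor) size() uint64 { return t.bytes }

type Tensors []*Tensor

type Layer map[string]*Tensor

// size sums the sizes of all tensors grouped under one layer key.
func (l Layer) size() (size uint64) {
	for _, t := range l {
		size += t.size()
	}
	return size
}

// Layers groups tensors by the leading component of their name, dropping the
// "blk" prefix so repeating blocks are keyed by their index ("0", "1", ...).
func (ts Tensors) Layers() map[string]Layer {
	layers := make(map[string]Layer)
	for _, t := range ts {
		parts := strings.Split(t.Name, ".")
		if parts[0] == "blk" {
			parts = parts[1:]
		}

		if _, ok := layers[parts[0]]; !ok {
			layers[parts[0]] = make(Layer)
		}

		layers[parts[0]][strings.Join(parts[1:], ".")] = t
	}
	return layers
}

func main() {
	ts := Tensors{
		{Name: "blk.0.attn_q.weight", bytes: 8 << 20},
		{Name: "blk.0.ffn_up.weight", bytes: 16 << 20},
		{Name: "output.weight", bytes: 64 << 20},
	}

	layers := ts.Layers()
	fmt.Println(layers["0"].size())      // 25165824: blk.0.* grouped under key "0"
	fmt.Println(layers["output"].size()) // 67108864
}

With this grouping, server.go can read layers[fmt.Sprintf("%d", i)].size() for each repeating block and layers["output"].size() for the output layer, instead of re-scanning tensor name prefixes through the removed LayerSize helper.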