// ollama/llm/llm.go
package llm

import (
	"context"
	"fmt"
	"log"
	"os"
	"runtime"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)
type LLM interface {
2023-12-05 14:57:33 -05:00
Predict(context.Context, PredictOpts, func(PredictResult)) error
Embedding(context.Context, string) ([]float64, error)
Encode(context.Context, string) ([]int, error)
Decode(context.Context, []int) (string, error)
2023-07-21 16:33:56 -04:00
Close()
}
2023-11-30 13:30:23 -05:00
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
2023-07-21 16:33:56 -04:00
if _, err := os.Stat(model); err != nil {
return nil, err
}
f, err := os.Open(model)
if err != nil {
return nil, err
}
2023-08-14 19:08:02 -04:00
defer f.Close()
2023-07-21 16:33:56 -04:00
2023-09-07 13:55:37 -04:00
ggml, err := DecodeGGML(f)
2023-07-21 16:33:56 -04:00
if err != nil {
return nil, err
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
fmt.Println("size", ggml.Size)
fmt.Println("filetype", ggml.FileType())
fmt.Println("architecture", ggml.ModelFamily())
fmt.Println("type", ggml.ModelType())
fmt.Println("name", ggml.Name())
fmt.Println("embd", ggml.NumEmbed())
fmt.Println("head", ggml.NumHead())
fmt.Println("head_kv", ggml.NumHeadKv())
fmt.Println("gqa", ggml.NumGQA())
available, _ := gpu.CheckVRAM()
// For now assume filesize = model size
// TODO: use actual model size
requiredModel := ggml.Size
// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
2023-10-12 13:36:23 -04:00
// this amount is the overhead + tensors in memory
// TODO: get this from the llama.cpp's graph calcluations instead of
2024-01-08 21:32:44 -05:00
// estimating it's 1/6 * kv_cache_size * num_gqa
requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
2023-10-12 13:36:23 -04:00
requiredTotal := requiredModel + requiredKv + requiredAlloc
log.Println("system memory bytes:", available)
log.Println("required model bytes:", requiredModel)
log.Println("required kv bytes:", requiredKv)
log.Println("required alloc bytes:", requiredAlloc)
log.Println("required total bytes:", requiredTotal)
info := gpu.GetGPUInfo()
library := info.Library
if opts.NumGPU == -1 {
// default to offloading all layers
opts.NumGPU = int(ggml.NumLayers()) + 1
}
// decide how many layers to put on the GPU
if opts.NumGPU > 0 {
switch runtime.GOOS {
case "darwin":
if requiredTotal > available {
log.Println("not enough vram available, falling back to CPU only")
opts.NumGPU = 0
}
default:
if library == "cpu" || library == "default" {
opts.NumGPU = 0
break
}
// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
// TODO: find the largest GPU and only reserve memory there
avgAvailable := available / int64(info.DeviceCount)
if requiredAlloc > avgAvailable {
log.Printf("not enough vram available, falling back to CPU only")
library = "cpu"
opts.NumGPU = 0
break
}
// we don't know which GPU will be used, so estimate
// the scratch buffer space on all of them
// TODO: allocate less layers to the GPU with the scratch buffer
// and more to the others (based on their available memory)
available -= requiredAlloc * int64(info.DeviceCount)
// no offloading required
if requiredModel+requiredKv <= available {
break
}
// fill remaining vram with layers
log.Println("splitting", available, "of available memory bytes into layers")
bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
log.Println("bytes per layer:", bytesPerLayer)
layers := available / bytesPerLayer
2024-01-08 23:17:44 -05:00
log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
if layers < int64(opts.NumGPU) {
opts.NumGPU = int(layers)
}
2023-10-13 17:41:51 -04:00
}
}
opts.NumGQA = 0
opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0
gpuInfo := gpu.GetGPUInfo()
return newLlmServer(gpuInfo, model, adapters, projectors, opts)
}
// Give any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
return nativeInit(workdir)
2023-07-21 16:33:56 -04:00
}
// newLlmServer starts an ext server for model, trying each candidate
// dynamic shim library for the detected GPU in turn and falling back to
// the built-in default server when none loads.
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
	for _, candidate := range getShims(gpuInfo) {
		// "default" acts as a sentinel: stop probing shims here
		if candidate == "default" {
			break
		}

		srv, err := newDynamicShimExtServer(candidate, model, adapters, projectors, opts)
		if err != nil {
			log.Printf("Failed to load dynamic library %s %s", candidate, err)
			continue
		}

		return srv, nil
	}

	return newDefaultExtServer(model, adapters, projectors, opts)
}