ollama/llm/llm.go
Daniel Hiltgen 39928a42e8 Always dynamically load the llm server library
This switches darwin to dynamic loading, and refactors the code now that no
static linking of the library is used on any platform
2024-01-11 08:42:47 -08:00

168 lines
4.8 KiB
Go

package llm
import (
"context"
"fmt"
"log"
"os"
"runtime"
"github.com/jmorganca/ollama/api"
"github.com/jmorganca/ollama/gpu"
)
type LLM interface {
Predict(context.Context, PredictOpts, func(PredictResult)) error
Embedding(context.Context, string) ([]float64, error)
Encode(context.Context, string) ([]int, error)
Decode(context.Context, []int) (string, error)
Close()
}
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
f, err := os.Open(model)
if err != nil {
return nil, err
}
defer f.Close()
ggml, err := DecodeGGML(f)
if err != nil {
return nil, err
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
fmt.Println("size", ggml.Size)
fmt.Println("filetype", ggml.FileType())
fmt.Println("architecture", ggml.ModelFamily())
fmt.Println("type", ggml.ModelType())
fmt.Println("name", ggml.Name())
fmt.Println("embd", ggml.NumEmbed())
fmt.Println("head", ggml.NumHead())
fmt.Println("head_kv", ggml.NumHeadKv())
fmt.Println("gqa", ggml.NumGQA())
available, _ := gpu.CheckVRAM()
// For now assume filesize = model size
// TODO: use actual model size
requiredModel := ggml.Size
// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
// this amount is the overhead + tensors in memory
// TODO: get this from the llama.cpp's graph calcluations instead of
// estimating it's 1/6 * kv_cache_size * num_gqa
requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
requiredTotal := requiredModel + requiredKv + requiredAlloc
log.Println("system memory bytes:", available)
log.Println("required model bytes:", requiredModel)
log.Println("required kv bytes:", requiredKv)
log.Println("required alloc bytes:", requiredAlloc)
log.Println("required total bytes:", requiredTotal)
info := gpu.GetGPUInfo()
library := info.Library
if opts.NumGPU == -1 {
// default to offloading all layers
opts.NumGPU = int(ggml.NumLayers()) + 1
}
// decide how many layers to put on the GPU
if opts.NumGPU > 0 {
switch runtime.GOOS {
case "darwin":
if requiredTotal > available {
log.Println("not enough vram available, falling back to CPU only")
opts.NumGPU = 0
}
default:
if library == "cpu" || library == "default" {
opts.NumGPU = 0
break
}
// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
// TODO: find the largest GPU and only reserve memory there
avgAvailable := available / int64(info.DeviceCount)
if requiredAlloc > avgAvailable {
log.Printf("not enough vram available, falling back to CPU only")
library = "cpu"
opts.NumGPU = 0
break
}
// we don't know which GPU will be used, so estimate
// the scratch buffer space on all of them
// TODO: allocate less layers to the GPU with the scratch buffer
// and more to the others (based on their available memory)
available -= requiredAlloc * int64(info.DeviceCount)
// no offloading required
if requiredModel+requiredKv <= available {
break
}
// fill remaining vram with layers
log.Println("splitting", available, "of available memory bytes into layers")
bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
log.Println("bytes per layer:", bytesPerLayer)
layers := available / bytesPerLayer
log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
if layers < int64(opts.NumGPU) {
opts.NumGPU = int(layers)
}
}
}
opts.NumGQA = 0
opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0
gpuInfo := gpu.GetGPUInfo()
return newLlmServer(gpuInfo, model, adapters, projectors, opts)
}
// Give any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
return nativeInit(workdir)
}
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
dynLibs := getDynLibs(gpuInfo)
// Check to see if the user has requested a specific library instead of auto-detecting
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
if demandLib != "" {
libPath := availableDynLibs[demandLib]
if libPath == "" {
log.Printf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib)
} else {
log.Printf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib)
dynLibs = []string{libPath}
}
}
err2 := fmt.Errorf("unable to locate suitable llm library")
for _, dynLib := range dynLibs {
srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
if err == nil {
return srv, nil
}
log.Printf("Failed to load dynamic library %s %s", dynLib, err)
err2 = err
}
return nil, err2
}