// ollama/llm/llm.go
package llm

import (
	"context"
	"fmt"
	"log"
	"os"
	"runtime"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)
type LLM interface {
2023-12-05 14:57:33 -05:00
Predict(context.Context, PredictOpts, func(PredictResult)) error
Embedding(context.Context, string) ([]float64, error)
Encode(context.Context, string) ([]int, error)
Decode(context.Context, []int) (string, error)
2023-07-21 16:33:56 -04:00
Close()
}
2023-11-30 13:30:23 -05:00
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
2023-07-21 16:33:56 -04:00
if _, err := os.Stat(model); err != nil {
return nil, err
}
f, err := os.Open(model)
if err != nil {
return nil, err
}
2023-08-14 19:08:02 -04:00
defer f.Close()
2023-07-21 16:33:56 -04:00
2023-09-07 13:55:37 -04:00
ggml, err := DecodeGGML(f)
2023-07-21 16:33:56 -04:00
if err != nil {
return nil, err
}
if opts.NumCtx < 4 {
opts.NumCtx = 4
}
fmt.Println("size", ggml.Size)
fmt.Println("filetype", ggml.FileType())
fmt.Println("architecture", ggml.ModelFamily())
fmt.Println("type", ggml.ModelType())
fmt.Println("name", ggml.Name())
fmt.Println("embd", ggml.NumEmbed())
fmt.Println("head", ggml.NumHead())
fmt.Println("head_kv", ggml.NumHeadKv())
fmt.Println("gqa", ggml.NumGQA())
available, _ := gpu.CheckVRAM()
// For now assume filesize = model size
// TODO: use actual model size
requiredModel := ggml.Size
// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
2023-10-12 13:36:23 -04:00
// this amount is the overhead + tensors in memory
// TODO: get this from the llama.cpp's graph calcluations instead of
2024-01-08 21:32:44 -05:00
// estimating it's 1/6 * kv_cache_size * num_gqa
requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
2023-10-12 13:36:23 -04:00
requiredTotal := requiredModel + requiredKv + requiredAlloc
log.Println("system memory bytes:", available)
log.Println("required model bytes:", requiredModel)
log.Println("required kv bytes:", requiredKv)
log.Println("required alloc bytes:", requiredAlloc)
log.Println("required total bytes:", requiredTotal)
info := gpu.GetGPUInfo()
library := info.Library
if opts.NumGPU == -1 {
// default to offloading all layers
opts.NumGPU = int(ggml.NumLayers()) + 1
}
// decide how many layers to put on the GPU
if opts.NumGPU > 0 {
switch runtime.GOOS {
case "darwin":
if requiredTotal > available {
log.Println("not enough vram available, falling back to CPU only")
opts.NumGPU = 0
}
default:
if library == "cpu" || library == "default" {
opts.NumGPU = 0
break
}
// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
// TODO: find the largest GPU and only reserve memory there
avgAvailable := available / int64(info.DeviceCount)
if requiredAlloc > avgAvailable {
log.Printf("not enough vram available, falling back to CPU only")
library = "cpu"
opts.NumGPU = 0
break
}
// we don't know which GPU will be used, so estimate
// the scratch buffer space on all of them
// TODO: allocate less layers to the GPU with the scratch buffer
// and more to the others (based on their available memory)
available -= requiredAlloc * int64(info.DeviceCount)
// no offloading required
if requiredModel+requiredKv <= available {
break
}
// fill remaining vram with layers
log.Println("splitting", available, "of available memory bytes into layers")
bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
log.Println("bytes per layer:", bytesPerLayer)
layers := available / bytesPerLayer
2024-01-08 23:17:44 -05:00
log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
if layers < int64(opts.NumGPU) {
opts.NumGPU = int(layers)
}
2023-10-13 17:41:51 -04:00
}
}
opts.NumGQA = 0
opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0
gpuInfo := gpu.GetGPUInfo()
return newLlmServer(gpuInfo, model, adapters, projectors, opts)
}
// Give any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
return nativeInit(workdir)
2023-07-21 16:33:56 -04:00
}
// newLlmServer starts an ext server for model, trying each candidate
// dynamic shim library for the detected GPU in turn and falling back to
// the built-in default server when none loads.
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
	for _, candidate := range getShims(gpuInfo) {
		// "default" acts as a sentinel: stop probing shims here
		if candidate == "default" {
			break
		}

		srv, err := newDynamicShimExtServer(candidate, model, adapters, projectors, opts)
		if err != nil {
			log.Printf("Failed to load dynamic library %s %s", candidate, err)
			continue
		}

		return srv, nil
	}

	return newDefaultExtServer(model, adapters, projectors, opts)
}