diff --git a/docs/development.md b/docs/development.md
index 85cf34c6..54d910e4 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -35,5 +35,5 @@ Now you can run `ollama`:
 ## Building on Linux with GPU support
 
 - Install cmake and nvidia-cuda-toolkit
-- run `go generate ./...`
+- run `CUDA_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\)\.\([0-9]\+\).*$/\1/p') go generate ./...`
 - run `go build .`
diff --git a/llm/ggml.go b/llm/ggml.go
index e95f5fc6..61f2c9c3 100644
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -4,7 +4,6 @@ import (
 	"encoding/binary"
 	"errors"
 	"io"
-	"path"
 	"sync"
 )
 
@@ -166,11 +165,6 @@ func (c *containerLORA) Decode(r io.Reader) (model, error) {
 	return nil, nil
 }
 
-var (
-	ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
-	ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
-)
-
 var (
 	ggmlInit       sync.Once
 	ggmlRunnerPath string
@@ -178,7 +172,7 @@ var (
 
 func ggmlRunner() ModelRunner {
 	ggmlInit.Do(func() {
-		ggmlRunnerPath = chooseRunner(ggmlGPU, ggmlCPU)
+		ggmlRunnerPath = chooseRunner("ggml")
 	})
 	return ModelRunner{Path: ggmlRunnerPath}
 }
diff --git a/llm/gguf.go b/llm/gguf.go
index 7680c90c..7a1cf1ba 100644
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -6,7 +6,6 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"path"
 	"sync"
 )
 
@@ -370,11 +369,6 @@ func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
 	return
 }
 
-var (
-	ggufGPU = path.Join("llama.cpp", "gguf", "build", "gpu", "bin")
-	ggufCPU = path.Join("llama.cpp", "gguf", "build", "cpu", "bin")
-)
-
 var (
 	ggufInit       sync.Once
 	ggufRunnerPath string
@@ -382,7 +376,7 @@ var (
 
 func ggufRunner() ModelRunner {
 	ggufInit.Do(func() {
-		ggufRunnerPath = chooseRunner(ggufGPU, ggufCPU)
+		ggufRunnerPath = chooseRunner("gguf")
 	})
 
 	return ModelRunner{Path: ggufRunnerPath}
diff --git a/llm/llama.cpp/generate_linux.go b/llm/llama.cpp/generate_linux.go
index 71ebbdd4..7436391f 100644
--- a/llm/llama.cpp/generate_linux.go
+++ b/llm/llama.cpp/generate_linux.go
@@ -7,9 +7,15 @@ package llm
 //go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
 //go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
 //go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
-//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/gpu --target server --config Release
+
+//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cpu --target server --config Release
 //go:generate git submodule update --force gguf
-//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build gguf/build/gpu --target server --config Release
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate cmake --build gguf/build/cpu --target server --config Release
+
+//go:generate cmake -S ggml -B ggml/build/cuda-${CUDA_VERSION} -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cuda-${CUDA_VERSION} --target server --config Release
+//go:generate cmake -S gguf -B gguf/build/cuda-${CUDA_VERSION} -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build gguf/build/cuda-${CUDA_VERSION} --target server --config Release
 
diff --git a/llm/llama.go b/llm/llama.go
index 4b7e6e01..8810665d 100644
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -17,6 +17,7 @@ import (
 	"os/exec"
 	"path"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"strconv"
 	"strings"
@@ -36,36 +37,99 @@ func osPath(llamaPath string) string {
 	return llamaPath
 }
 
-func chooseRunner(gpuPath, cpuPath string) string {
+func cudaVersion() (int, error) {
+	// try nvcc first; it gives the most accurate version if available
+	cmd := exec.Command("nvcc", "--version")
+	output, err := cmd.CombinedOutput()
+	if err == nil {
+		// regex to match the CUDA version line in nvcc --version output
+		re := regexp.MustCompile(`release (\d+\.\d+),`)
+		matches := re.FindStringSubmatch(string(output))
+		if len(matches) >= 2 {
+			cudaVersion := matches[1]
+			cudaVersionParts := strings.Split(cudaVersion, ".")
+			cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
+			if err == nil {
+				return cudaMajorVersion, nil
+			}
+		}
+	}
+
+	// fall back to nvidia-smi
+	cmd = exec.Command("nvidia-smi")
+	output, err = cmd.CombinedOutput()
+	if err != nil {
+		return -1, err
+	}
+
+	re := regexp.MustCompile(`CUDA Version: (\d+\.\d+)`)
+	matches := re.FindStringSubmatch(string(output))
+	if len(matches) < 2 {
+		return -1, errors.New("could not find CUDA version")
+	}
+
+	cudaVersion := matches[1]
+	cudaVersionParts := strings.Split(cudaVersion, ".")
+	cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
+	if err != nil {
+		return -1, err
+	}
+	return cudaMajorVersion, nil
+}
+
+func chooseRunner(runnerType string) string {
 	tmpDir, err := os.MkdirTemp("", "llama-*")
 	if err != nil {
 		log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
 	}
 
-	llamaPath := osPath(gpuPath)
+	cpuPath := osPath(path.Join("llama.cpp", runnerType, "build", "cpu", "bin"))
+	llamaPath := cpuPath
+	files := []string{"server"}
+
+	// set OS-specific llama.cpp runner paths
+	switch runtime.GOOS {
+	case "darwin":
+		// TODO: change to check metal version
+		llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", "gpu", "bin"))
+		files = append(files, "ggml-metal.metal")
+	case "linux":
+		cudaVersion, err := cudaVersion()
+		if err != nil {
+			// the CUDA version check below falls back to the CPU runner
+			log.Printf("failed to get CUDA version: %v", err)
+		}
+
+		switch cudaVersion {
+		case 11, 12:
+			cudaDir := fmt.Sprintf("cuda-%d", cudaVersion)
+			llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", cudaDir, "bin"))
+		default:
+			if cudaVersion != -1 {
+				// a valid version was returned but it is not supported
+				log.Printf("CUDA version %d not supported, falling back to CPU", cudaVersion)
+			}
+			llamaPath = cpuPath
+		}
+	case "windows":
+		// TODO: select windows GPU runner here when available
+		files = []string{"server.exe"}
+	default:
+		log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
+	}
+
+	// check that the runner exists; if not, fall back to the CPU runner
 	if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
-		llamaPath = osPath(cpuPath)
+		// fall back to the CPU runner
+		llamaPath = cpuPath
+		files = []string{"server"}
 		if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
 			log.Fatalf("llama.cpp executable not found")
 		}
+		log.Printf("llama.cpp %s executable not found, falling back to cpu", runnerType)
 	}
 
-	files := []string{"server"}
-	switch runtime.GOOS {
-	case "windows":
-		files = []string{"server.exe"}
-	case "darwin":
-		if llamaPath == osPath(gpuPath) {
-			files = append(files, "ggml-metal.metal")
-		}
-	case "linux":
-		// check if there is a GPU available
-		if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
-			// this error was logged on start-up, so we don't need to log it again
-			llamaPath = osPath(cpuPath)
-		}
-	}
-
+	// copy the files locally to run the llama.cpp server
 	for _, f := range files {
 		srcPath := path.Join(llamaPath, f)
 		destPath := filepath.Join(tmpDir, f)
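For quick verification outside the build, here is a minimal standalone sketch that mirrors the CUDA detection logic `chooseRunner` now relies on: it asks `nvcc` for the toolkit release and falls back to the driver version reported by `nvidia-smi`, as in the `llm/llama.go` change above. The `detectCUDAMajor` name and the `main` wrapper are illustrative only and not part of the patch.

```go
package main

import (
	"fmt"
	"os/exec"
	"regexp"
	"strconv"
	"strings"
)

// detectCUDAMajor returns the CUDA major version: nvcc's "release X.Y" line
// is preferred, with the driver version from nvidia-smi as a fallback.
func detectCUDAMajor() (int, error) {
	if out, err := exec.Command("nvcc", "--version").CombinedOutput(); err == nil {
		re := regexp.MustCompile(`release (\d+\.\d+),`)
		if m := re.FindStringSubmatch(string(out)); len(m) >= 2 {
			return strconv.Atoi(strings.Split(m[1], ".")[0])
		}
	}

	out, err := exec.Command("nvidia-smi").CombinedOutput()
	if err != nil {
		return -1, err
	}
	m := regexp.MustCompile(`CUDA Version: (\d+\.\d+)`).FindStringSubmatch(string(out))
	if len(m) < 2 {
		return -1, fmt.Errorf("could not find CUDA version")
	}
	return strconv.Atoi(strings.Split(m[1], ".")[0])
}

func main() {
	major, err := detectCUDAMajor()
	if err != nil {
		fmt.Println("no CUDA detected; the CPU runner would be used:", err)
		return
	}
	fmt.Printf("detected CUDA major version %d (runner dir: cuda-%d)\n", major, major)
}
```

On a CUDA 12 machine this prints 12, matching the `cuda-12` build directory produced by the new `go:generate` directives; any other or missing version falls back to the CPU runner, per the `case 11, 12` switch in `chooseRunner`.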