diff --git a/docs/development.md b/docs/development.md index 54d910e4..85cf34c6 100644 --- a/docs/development.md +++ b/docs/development.md @@ -35,5 +35,5 @@ Now you can run `ollama`: ## Building on Linux with GPU support - Install cmake and nvidia-cuda-toolkit -- run `CUDA_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\)\.\([0-9]\+\).*$/\1/p') go generate ./...` +- run `go generate ./...` - run `go build .` diff --git a/llm/llama.cpp/generate_darwin_amd64.go b/llm/llama.cpp/generate_darwin_amd64.go index a8b4f0ad..9b782db3 100644 --- a/llm/llama.cpp/generate_darwin_amd64.go +++ b/llm/llama.cpp/generate_darwin_amd64.go @@ -3,11 +3,10 @@ package llm //go:generate git submodule init //go:generate git submodule update --force ggml -//go:generate -command git-apply git -C ggml apply -//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch -//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch -//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch -//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch +//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch +//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch +//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch +//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 //go:generate cmake --build ggml/build/cpu --target server --config Release diff --git a/llm/llama.cpp/generate_darwin_arm64.go b/llm/llama.cpp/generate_darwin_arm64.go index 4923fefb..72d175ef 100644 --- a/llm/llama.cpp/generate_darwin_arm64.go +++ b/llm/llama.cpp/generate_darwin_arm64.go @@ -3,11 +3,10 @@ 
package llm //go:generate git submodule init //go:generate git submodule update --force ggml -//go:generate -command git-apply git -C ggml apply -//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch -//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch -//go:generate git-apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch -//go:generate git-apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch +//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch +//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch +//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch +//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch //go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 //go:generate cmake --build ggml/build/metal --target server --config Release diff --git a/llm/llama.cpp/generate_linux.go b/llm/llama.cpp/generate_linux.go index 7436391f..76be15d5 100644 --- a/llm/llama.cpp/generate_linux.go +++ b/llm/llama.cpp/generate_linux.go @@ -3,19 +3,19 @@ package llm //go:generate git submodule init //go:generate git submodule update --force ggml -//go:generate -command git-apply git -C ggml apply -//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch -//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch -//go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch - +//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch +//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch +//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch +//go:generate git 
-C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on //go:generate cmake --build ggml/build/cpu --target server --config Release //go:generate git submodule update --force gguf +//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on //go:generate cmake --build gguf/build/cpu --target server --config Release -//go:generate cmake -S ggml -B ggml/build/cuda-${CUDA_VERSION} -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -//go:generate cmake --build ggml/build/cuda-${CUDA_VERSION} --target server --config Release -//go:generate cmake -S gguf -B gguf/build/cuda-${CUDA_VERSION} -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -//go:generate cmake --build gguf/build/cuda-${CUDA_VERSION} --target server --config Release +//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on +//go:generate cmake --build ggml/build/cuda --target server --config Release +//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on +//go:generate cmake --build gguf/build/cuda --target server --config Release diff --git a/llm/llama.cpp/generate.go b/llm/llama.cpp/generate_windows.go similarity index 65% rename from llm/llama.cpp/generate.go rename to llm/llama.cpp/generate_windows.go index 40a42708..0d8cd411 100644 --- a/llm/llama.cpp/generate.go +++ b/llm/llama.cpp/generate_windows.go @@ -1,14 +1,10 @@ -//go:build !darwin -// +build !darwin - package llm //go:generate git submodule init //go:generate git submodule update --force ggml -//go:generate -command git-apply git -C ggml apply -//go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch -//go:generate git-apply ../ggml_patch/0002-34B-model-support.patch +//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch 
+//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch //go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on //go:generate cmake --build ggml/build/cpu --target server --config Release diff --git a/llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch b/llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch deleted file mode 100644 index 870e982a..00000000 --- a/llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001 -From: Bruce MacDonald -Date: Tue, 5 Sep 2023 16:05:08 -0400 -Subject: [PATCH] metal: add missing barriers for mul-mat #2699 - ---- - ggml-metal.metal | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/ggml-metal.metal b/ggml-metal.metal -index 3f31252..ce3541f 100644 ---- a/ggml-metal.metal -+++ b/ggml-metal.metal -@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0, - //load data and store to threadgroup memory - half4x4 temp_a; - dequantize_func(x, il, temp_a); -+ threadgroup_barrier(mem_flags::mem_threadgroup); - #pragma unroll(16) - for (int i = 0; i < 16; i++) { - *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ -@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0, - } - } else { - // block is smaller than 64x32, we should avoid writing data outside of the matrix -+ threadgroup_barrier(mem_flags::mem_threadgroup); - threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ - + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; - for (int i = 0; i < 8; i++) { --- -2.39.2 (Apple Git-143) - diff --git a/llm/llama.cpp/ggml_patch/0001-add-detokenize-endpoint.patch b/llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch similarity index 100% rename from llm/llama.cpp/ggml_patch/0001-add-detokenize-endpoint.patch rename to llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch 
diff --git a/llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch b/llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch new file mode 100644 index 00000000..1fd07973 --- /dev/null +++ b/llm/llama.cpp/patches/0001-copy-cuda-runtime-libraries.patch @@ -0,0 +1,27 @@ +From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001 +From: Michael Yang +Date: Wed, 20 Sep 2023 14:19:52 -0700 +Subject: [PATCH] copy cuda runtime libraries + +--- + CMakeLists.txt | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 824d9f2..dd24137 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + endif() + ++ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY) ++ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY) ++ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY) ++ + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + # 52 == lowest CUDA 12 standard + # 60 == f16 CUDA intrinsics +-- +2.42.0 + diff --git a/llm/llama.cpp/ggml_patch/0002-34B-model-support.patch b/llm/llama.cpp/patches/0002-34B-model-support.patch similarity index 100% rename from llm/llama.cpp/ggml_patch/0002-34B-model-support.patch rename to llm/llama.cpp/patches/0002-34B-model-support.patch diff --git a/llm/llama.cpp/ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch b/llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch similarity index 100% rename from llm/llama.cpp/ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch rename to 
llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch diff --git a/llm/llama.cpp/ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch b/llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch similarity index 100% rename from llm/llama.cpp/ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch rename to llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch diff --git a/llm/llama.cpp/ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch b/llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch similarity index 100% rename from llm/llama.cpp/ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch rename to llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch diff --git a/llm/llama.go b/llm/llama.go index 2390f653..9118da2a 100644 --- a/llm/llama.go +++ b/llm/llama.go @@ -17,7 +17,6 @@ import ( "os/exec" "path" "path/filepath" - "regexp" "runtime" "strconv" "strings" @@ -29,46 +28,6 @@ import ( //go:embed llama.cpp/*/build/*/bin/* var llamaCppEmbed embed.FS -func cudaVersion() int { - // first try nvcc, it gives the most accurate version if available - cmd := exec.Command("nvcc", "--version") - output, err := cmd.CombinedOutput() - if err == nil { - // regex to match the CUDA version line in nvcc --version output - re := regexp.MustCompile(`release (\d+\.\d+),`) - matches := re.FindStringSubmatch(string(output)) - if len(matches) >= 2 { - cudaVersion := matches[1] - cudaVersionParts := strings.Split(cudaVersion, ".") - cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0]) - if err == nil { - return cudaMajorVersion - } - } - } - - // fallback to nvidia-smi - cmd = exec.Command("nvidia-smi") - output, err = cmd.CombinedOutput() - if err != nil { - return -1 - } - - re := regexp.MustCompile(`CUDA Version: (\d+\.\d+)`) - matches := re.FindStringSubmatch(string(output)) - 
if len(matches) < 2 { - return -1 - } - - cudaVersion := matches[1] - cudaVersionParts := strings.Split(cudaVersion, ".") - cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0]) - if err != nil { - return -1 - } - return cudaMajorVersion -} - type ModelRunner struct { Path string // path to the model runner executable } @@ -86,20 +45,9 @@ func chooseRunners(runnerType string) []ModelRunner { path.Join(buildPath, "cpu", "bin", "server"), } case "linux": - cuda := cudaVersion() - if cuda == 11 { - // prioritize CUDA 11 runner - runners = []string{ - path.Join(buildPath, "cuda-11", "bin", "server"), - path.Join(buildPath, "cuda-12", "bin", "server"), - path.Join(buildPath, "cpu", "bin", "server"), - } - } else { - runners = []string{ - path.Join(buildPath, "cuda-12", "bin", "server"), - path.Join(buildPath, "cuda-11", "bin", "server"), - path.Join(buildPath, "cpu", "bin", "server"), - } + runners = []string{ + path.Join(buildPath, "cuda", "bin", "server"), + path.Join(buildPath, "cpu", "bin", "server"), } case "windows": // TODO: select windows GPU runner here when available @@ -353,7 +301,7 @@ func newLlama(model string, adapters []string, runners []ModelRunner, opts api.O runner.Path, append(params, "--port", strconv.Itoa(port))..., ) - + cmd.Env = append(os.Environ(), fmt.Sprintf("LD_LIBRARY_PATH=%s", filepath.Dir(runner.Path))) cmd.Stdout = os.Stderr cmd.Stderr = os.Stderr diff --git a/server/routes.go b/server/routes.go index d3d3d11c..79d2ee72 100644 --- a/server/routes.go +++ b/server/routes.go @@ -556,7 +556,7 @@ func Serve(ln net.Listener, origins []string) error { if runtime.GOOS == "linux" { // check compatibility to log warnings if _, err := llm.CheckVRAM(); err != nil { - log.Printf("Warning: GPU support not enabled, you may need to install GPU drivers: %v", err) + log.Printf("Warning: GPU support may not be enabled, check that you have installed GPU drivers: %v", err) } }