From f59c4d03f789f745ffce6c4b68fcf218fac85435 Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 12 Sep 2023 17:06:48 -0400 Subject: [PATCH] fix ggml arm64 cuda build (#520) --- llm/llama.cpp/generate_linux.go | 5 +-- ...DA-s-half-type-for-aarch64-1455-2670.patch | 32 +++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 llm/llama.cpp/ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch diff --git a/llm/llama.cpp/generate_linux.go b/llm/llama.cpp/generate_linux.go index 74d09afb..71ebbdd4 100644 --- a/llm/llama.cpp/generate_linux.go +++ b/llm/llama.cpp/generate_linux.go @@ -6,9 +6,10 @@ package llm //go:generate -command git-apply git -C ggml apply //go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch //go:generate git-apply ../ggml_patch/0002-34B-model-support.patch -//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on +//go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch +//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on //go:generate cmake --build ggml/build/gpu --target server --config Release //go:generate git submodule update --force gguf -//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on +//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on //go:generate cmake --build gguf/build/gpu --target server --config Release diff --git a/llm/llama.cpp/ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch b/llm/llama.cpp/ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch new file mode 100644 index 00000000..7b67f680 --- /dev/null +++ b/llm/llama.cpp/ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch @@ -0,0 +1,32 @@ +From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001 +From: Kylin <56434533+KyL0N@users.noreply.github.com> +Date: Tue, 22 Aug 2023 15:14:23 +0800 +Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670) + +* ggml: support CUDA's half type for aarch64(#1455) +support CUDA's half type for aarch64 in ggml_fp16_t definition + +* ggml: use __CUDACC__ to recognise nvcc compiler +--- + ggml.h | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/ggml.h b/ggml.h +index 544ad2d..0ec7ec5 100644 +--- a/ggml.h ++++ b/ggml.h +@@ -259,8 +259,9 @@ + extern "C" { + #endif + +-#ifdef __ARM_NEON +- // we use the built-in 16-bit float type ++#if defined(__ARM_NEON) && defined(__CUDACC__) ++ typedef half ggml_fp16_t; ++#elif defined(__ARM_NEON) + typedef __fp16 ggml_fp16_t; + #else + typedef uint16_t ggml_fp16_t; +-- +2.39.2 (Apple Git-143) +