diff --git a/llm/ext_server/ext_server.cpp b/llm/ext_server/ext_server.cpp
index 0d879e1e..7a6558b4 100644
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -125,7 +125,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
       return;
     }
 
-    llama->initialize();
+    llama->init();
   } catch (std::exception &e) {
     err->id = -1;
     snprintf(err->msg, err->msg_len, "exception %s", e.what());
diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index 4b806b02..e0040453 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -60,12 +60,19 @@ case "${GOARCH}" in
     compress_libs
     ;;
 "arm64")
-    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
+    # bundle ggml-common.h and ggml-metal.metal into a single file
+    grep -v '#include "ggml-common.h"' "${LLAMACPP_DIR}/ggml-metal.metal" | grep -v '#pragma once' > "${LLAMACPP_DIR}/ggml-metal.metal.temp"
+    echo '#define GGML_COMMON_IMPL_METAL' > "${LLAMACPP_DIR}/ggml-metal.metal"
+    cat "${LLAMACPP_DIR}/ggml-common.h" | grep -v '#pragma once' >> "${LLAMACPP_DIR}/ggml-metal.metal"
+    cat "${LLAMACPP_DIR}/ggml-metal.metal.temp" >> "${LLAMACPP_DIR}/ggml-metal.metal"
+    rm "${LLAMACPP_DIR}/ggml-metal.metal.temp"
+    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
     BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
     EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
     build
     sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
     compress_libs
+    (cd ${LLAMACPP_DIR} && git checkout ggml-metal.metal)
     ;;
 *)
     echo "GOARCH must be set"
diff --git a/llm/llama.cpp b/llm/llama.cpp
index c2101a2e..77d1ac7e 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit c2101a2e909ac7c08976d414e64e96c90ee5fa9e
+Subproject commit 77d1ac7e00bf049b9f2bba1b5a310a78318c49c4
diff --git a/llm/patches/02-cudaleaks.diff b/llm/patches/02-cudaleaks.diff
index 0c4298ba..79053f0e 100644
--- a/llm/patches/02-cudaleaks.diff
+++ b/llm/patches/02-cudaleaks.diff
@@ -1,10 +1,10 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index f255ad76..5b83acb1 100644
+index b14cca61..02bfd4b1 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -28,6 +28,10 @@
- #include
+@@ -29,6 +29,10 @@
  #include
+ #include
 +#ifdef GGML_USE_CUBLAS
 +extern "C" GGML_CALL void ggml_free_cublas(void);
@@ -13,7 +13,7 @@ index f255ad76..5b83acb1 100644
  using json = nlohmann::json;
 
  bool server_verbose = false;
-@@ -648,6 +652,10 @@ struct server_context {
+@@ -664,6 +668,10 @@ struct server_context {
          llama_free_model(model);
          model = nullptr;
      }
@@ -24,7 +24,7 @@ index f255ad76..5b83acb1 100644
      }
 
      bool load_model(const gpt_params & params_) {
-@@ -3339,6 +3347,7 @@ int main(int argc, char ** argv) {
+@@ -3499,6 +3507,7 @@ int main(int argc, char ** argv) {
          sigemptyset (&sigint_action.sa_mask);
          sigint_action.sa_flags = 0;
          sigaction(SIGINT, &sigint_action, NULL);
@@ -33,10 +33,10 @@ index f255ad76..5b83acb1 100644
      auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
          return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
 diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index 72bcec8c..50a45e3d 100644
+index c207ff87..945708a4 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
-@@ -43,6 +43,7 @@
+@@ -46,6 +46,7 @@
  #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
  #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
  #define cublasCreate hipblasCreate
 +#define cublasDestroy hipblasDestroy
  #define cublasGemmEx hipblasGemmEx
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
  #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+@@ -8014,10 +8015,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
      return g_cublas_loaded;
  }
  #ifdef __HIP_PLATFORM_AMD__
      // Workaround for a rocBLAS bug when using multiple graphics cards:
-@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8027,7 +8028,7 @@ GGML_CALL void ggml_init_cublas() {
  #endif
      if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
          g_cublas_loaded = false;
          fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
          return;
-@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8098,7 +8099,7 @@ GGML_CALL void ggml_init_cublas() {
          // configure logging to stdout
          // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
          g_cublas_loaded = true;
      }
  }
-@@ -12490,3 +12491,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -11753,3 +11754,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
      }
      return device_count;
  }
 +
++
 +extern "C" GGML_CALL void ggml_free_cublas(void);
 +GGML_CALL void ggml_free_cublas(void) {
 +    for (int id = 0; id < g_device_count; ++id) {
diff --git a/llm/payload_common.go b/llm/payload_common.go
index 500d0582..010ba73d 100644
--- a/llm/payload_common.go
+++ b/llm/payload_common.go
@@ -108,20 +108,8 @@ func nativeInit() error {
     if err != nil {
         return err
     }
-    slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
-    if runtime.GOOS == "darwin" {
-        err := extractPayloadFiles(payloadsDir, "llama.cpp/ggml-metal.metal")
-        if err != nil {
-            if err == payloadMissing {
-                // TODO perhaps consider this a hard failure on arm macs?
-                slog.Info("ggml-meta.metal payload missing")
-                return nil
-            }
-            return err
-        }
-        os.Setenv("GGML_METAL_PATH_RESOURCES", payloadsDir)
-    }
+    slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
 
     libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*")
     if err != nil {
@@ -211,44 +199,6 @@ func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
     return libs, g.Wait()
 }
 
-func extractPayloadFiles(payloadsDir, glob string) error {
-    files, err := fs.Glob(libEmbed, glob)
-    if err != nil || len(files) == 0 {
-        return payloadMissing
-    }
-
-    for _, file := range files {
-        srcFile, err := libEmbed.Open(file)
-        if err != nil {
-            return fmt.Errorf("read payload %s: %v", file, err)
-        }
-        defer srcFile.Close()
-        if err := os.MkdirAll(payloadsDir, 0o755); err != nil {
-            return fmt.Errorf("create payload lib dir %s: %v", payloadsDir, err)
-        }
-        src := io.Reader(srcFile)
-        filename := file
-        if strings.HasSuffix(file, ".gz") {
-            src, err = gzip.NewReader(src)
-            if err != nil {
-                return fmt.Errorf("decompress payload %s: %v", file, err)
-            }
-            filename = strings.TrimSuffix(filename, ".gz")
-        }
-
-        destFile := filepath.Join(payloadsDir, filepath.Base(filename))
-        destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-        if err != nil {
-            return fmt.Errorf("write payload %s: %v", file, err)
-        }
-        defer destFp.Close()
-        if _, err := io.Copy(destFp, src); err != nil {
-            return fmt.Errorf("copy payload %s: %v", file, err)
-        }
-    }
-    return nil
-}
-
 func verifyDriverAccess() error {
     if runtime.GOOS != "linux" {
         return nil
diff --git a/llm/payload_darwin_amd64.go b/llm/payload_darwin_amd64.go
index a1c70ba9..dfeeb9cf 100644
--- a/llm/payload_darwin_amd64.go
+++ b/llm/payload_darwin_amd64.go
@@ -4,5 +4,5 @@ import (
     "embed"
 )
 
-//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
+//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
 var libEmbed embed.FS