From 4613a080e70a965f35efb28fcbf6c776cdfa0e32 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 20 Feb 2024 17:42:31 -0500 Subject: [PATCH] update llama.cpp submodule to `66c1968f7` (#2618) --- llm/dyn_ext_server.go | 7 +- llm/ext_server/ext_server.cpp | 5 +- llm/ext_server/ext_server.h | 2 +- llm/llama.cpp | 2 +- .../{03-cudaleaks.diff => 02-cudaleaks.diff} | 57 ++++++----- llm/patches/02-shutdown.diff | 96 ------------------- 6 files changed, 39 insertions(+), 130 deletions(-) rename llm/patches/{03-cudaleaks.diff => 02-cudaleaks.diff} (72%) delete mode 100644 llm/patches/02-shutdown.diff diff --git a/llm/dyn_ext_server.go b/llm/dyn_ext_server.go index 45b8da12..8d7ebf9e 100644 --- a/llm/dyn_ext_server.go +++ b/llm/dyn_ext_server.go @@ -106,7 +106,12 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts sparams.memory_f16 = C.bool(opts.F16KV) sparams.use_mlock = C.bool(opts.UseMLock) sparams.use_mmap = C.bool(opts.UseMMap) - sparams.numa = C.bool(opts.UseNUMA) + + if opts.UseNUMA { + sparams.numa = C.int(1) + } else { + sparams.numa = C.int(0) + } sparams.lora_adapters = nil for i := 0; i < len(adapters); i++ { diff --git a/llm/ext_server/ext_server.cpp b/llm/ext_server/ext_server.cpp index 376bc44b..f077d73e 100644 --- a/llm/ext_server/ext_server.cpp +++ b/llm/ext_server/ext_server.cpp @@ -80,7 +80,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) { params.main_gpu = sparams->main_gpu; params.use_mlock = sparams->use_mlock; params.use_mmap = sparams->use_mmap; - params.numa = sparams->numa; + params.numa = (ggml_numa_strategy)sparams->numa; params.embedding = sparams->embedding; if (sparams->model != NULL) { params.model = sparams->model; @@ -111,7 +111,8 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) { } #endif - llama_backend_init(params.numa); + llama_backend_init(); + llama_numa_init(params.numa); // load the model if (!llama->load_model(params)) { diff --git a/llm/ext_server/ext_server.h b/llm/ext_server/ext_server.h index 8eefb3cc..9b9ce2ec 100644 --- a/llm/ext_server/ext_server.h +++ b/llm/ext_server/ext_server.h @@ -41,7 +41,7 @@ typedef struct ext_server_params { int32_t main_gpu; // the GPU that is used for scratch and small tensors bool use_mlock; // force system to keep model in RAM bool use_mmap; // use mmap if possible - bool numa; // attempt optimizations that help on some NUMA systems + int numa; // attempt optimizations that help on some NUMA systems bool embedding; // get only sentence embedding ext_server_lora_adapter_t *lora_adapters; char *mmproj; diff --git a/llm/llama.cpp b/llm/llama.cpp index 6c00a066..66c1968f 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit 6c00a066928b0475b865a2e3e709e2166e02d548 +Subproject commit 66c1968f7a2e895675425e875b6589f1233a1b52 diff --git a/llm/patches/03-cudaleaks.diff b/llm/patches/02-cudaleaks.diff similarity index 72% rename from llm/patches/03-cudaleaks.diff rename to llm/patches/02-cudaleaks.diff index 674f8b1a..111fc83d 100644 --- a/llm/patches/03-cudaleaks.diff +++ b/llm/patches/02-cudaleaks.diff @@ -1,30 +1,29 @@ diff --git a/examples/server/server.cpp b/examples/server/server.cpp -index 3102762c..568ac1d0 100644 +index 7800c6e7..be30db23 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp -@@ -307,6 +307,10 @@ struct llama_client_slot - } - }; +@@ -30,6 +30,10 @@ + #include + #include +#ifdef GGML_USE_CUBLAS +extern "C" GGML_CALL void ggml_free_cublas(void); +#endif + - struct 
llama_server_context - { - llama_model *model = nullptr; -@@ -353,6 +357,10 @@ struct llama_server_context + using json = nlohmann::json; + + struct server_params +@@ -353,6 +357,9 @@ struct llama_server_context llama_free_model(model); model = nullptr; } +#ifdef GGML_USE_CUBLAS + ggml_free_cublas(); +#endif -+ } bool load_model(const gpt_params ¶ms_) -@@ -3093,6 +3101,7 @@ int main(int argc, char **argv) +@@ -3143,6 +3150,7 @@ int main(int argc, char **argv) sigemptyset (&sigint_action.sa_mask); sigint_action.sa_flags = 0; sigaction(SIGINT, &sigint_action, NULL); @@ -32,13 +31,8 @@ index 3102762c..568ac1d0 100644 #elif defined (_WIN32) auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; -@@ -3106,3 +3115,4 @@ int main(int argc, char **argv) - llama_backend_free(); - return 0; - } -+ diff --git a/ggml-cuda.cu b/ggml-cuda.cu -index 96976f24..3543920e 100644 +index 933ebbc4..88a4f664 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -39,6 +39,7 @@ @@ -49,30 +43,30 @@ index 96976f24..3543920e 100644 #define cublasGemmEx hipblasGemmEx #define cublasGemmBatchedEx hipblasGemmBatchedEx #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -@@ -7928,10 +7929,11 @@ GGML_CALL bool ggml_cublas_loaded(void) { +@@ -7991,10 +7992,10 @@ GGML_CALL bool ggml_cublas_loaded(void) { return g_cublas_loaded; } -+static bool g_cublas_initialized = false; -+ - GGML_CALL void ggml_init_cublas() { +-GGML_CALL void ggml_init_cublas() { - static bool initialized = false; ++static bool g_cublas_initialized = false; - if (!initialized) { ++GGML_CALL void ggml_init_cublas() { + if (!g_cublas_initialized) { #ifdef __HIP_PLATFORM_AMD__ // Workaround for a rocBLAS bug when using multiple graphics cards: -@@ -7941,7 +7943,7 @@ GGML_CALL void ggml_init_cublas() { +@@ -8004,7 +8005,7 @@ GGML_CALL void ggml_init_cublas() { #endif if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) { - initialized = true; + g_cublas_initialized = true; g_cublas_loaded = false; + fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__); return; - } -@@ -8011,7 +8013,7 @@ GGML_CALL void ggml_init_cublas() { +@@ -8075,7 +8076,7 @@ GGML_CALL void ggml_init_cublas() { // configure logging to stdout // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); @@ -81,25 +75,30 @@ index 96976f24..3543920e 100644 g_cublas_loaded = true; } } -@@ -11528,3 +11530,17 @@ GGML_CALL int ggml_backend_cuda_reg_devices() { +@@ -11604,3 +11605,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() { } return device_count; } + ++ +extern "C" GGML_CALL void ggml_free_cublas(void); +GGML_CALL void ggml_free_cublas(void) { + for (int id = 0; id < g_device_count; ++id) { -+#if !defined(GGML_USE_HIPBLAS) -+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id])); -+ g_cuda_pool_size[id] = 0; -+ g_cuda_pool_addr[id] = 0; ++#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) ++ if (g_device_caps[id].vmm) { ++ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id])); ++ g_cuda_pool_size[id] = 0; ++ g_cuda_pool_addr[id] = 0; ++ } +#endif ++ // TODO: free legacy non-vmm memory ++ // destroy cublas handle + CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id])); + g_cublas_handles[id] = nullptr; + } ++ + g_cublas_initialized = false; +} -\ No newline at end of file diff --git a/ggml-cuda.h b/ggml-cuda.h index b1ebd61d..b4c80c2c 100644 --- a/ggml-cuda.h diff --git a/llm/patches/02-shutdown.diff 
b/llm/patches/02-shutdown.diff
deleted file mode 100644
index fc13e328..00000000
--- a/llm/patches/02-shutdown.diff
+++ /dev/null
@@ -1,96 +0,0 @@
-diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index a0b46970..7800c6e7 100644
---- a/examples/server/server.cpp
-+++ b/examples/server/server.cpp
-@@ -28,6 +28,7 @@
- #include
- #include
- #include
-+#include <signal.h>
- 
- using json = nlohmann::json;
- 
-@@ -2511,6 +2512,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
-     }
- }
- 
-+std::function<void(int)> shutdown_handler;
-+inline void signal_handler(int signal) { shutdown_handler(signal); }
-+
- int main(int argc, char **argv)
- {
- #if SERVER_VERBOSE != 1
-@@ -3128,8 +3132,25 @@ int main(int argc, char **argv)
-                 std::placeholders::_2,
-                 std::placeholders::_3
-             ));
--    llama.queue_tasks.start_loop();
- 
-+    shutdown_handler = [&](int) {
-+        llama.queue_tasks.terminate();
-+    };
-+
-+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-+    struct sigaction sigint_action;
-+    sigint_action.sa_handler = signal_handler;
-+    sigemptyset (&sigint_action.sa_mask);
-+    sigint_action.sa_flags = 0;
-+    sigaction(SIGINT, &sigint_action, NULL);
-+#elif defined (_WIN32)
-+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
-+        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
-+    };
-+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
-+#endif
-+    llama.queue_tasks.start_loop();
-+    svr.stop();
-     t.join();
- 
-     llama_backend_free();
-diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
-index 54854896..0ee670db 100644
---- a/examples/server/utils.hpp
-+++ b/examples/server/utils.hpp
-@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
- struct llama_server_queue {
-     int id = 0;
-     std::mutex mutex_tasks;
-+    bool running;
-     // queues
-     std::vector queue_tasks;
-     std::vector queue_tasks_deferred;
-@@ -278,9 +279,18 @@ struct llama_server_queue {
-         queue_tasks_deferred.clear();
-     }
- 
--    // Start the main loop. This call is blocking
--    [[noreturn]]
-+    // end the start_loop routine
-+    void terminate() {
-+        {
-+            std::unique_lock<std::mutex> lock(mutex_tasks);
-+            running = false;
-+        }
-+        condition_tasks.notify_all();
-+    }
-+
-+    // Start the main loop.
-     void start_loop() {
-+        running = true;
-         while (true) {
-             // new task arrived
-             LOG_VERBOSE("have new task", {});
-@@ -324,8 +334,12 @@ struct llama_server_queue {
-         {
-             std::unique_lock<std::mutex> lock(mutex_tasks);
-             if (queue_tasks.empty()) {
-+                if (!running) {
-+                    LOG_VERBOSE("ending start_loop", {});
-+                    return;
-+                }
-                 condition_tasks.wait(lock, [&]{
--                    return !queue_tasks.empty();
-+                    return (!queue_tasks.empty() || !running);
-                 });
-             }
-         }
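
Note on the API change this submodule bump adapts to: at llama.cpp `66c1968f7`, `llama_backend_init()` no longer takes a NUMA flag; NUMA setup moved into a separate `llama_numa_init()` call that takes a `ggml_numa_strategy` value. That is why `ext_server_params.numa` changes from `bool` to `int` above and why the Go wrapper passes `1` when `opts.UseNUMA` is set. Below is a minimal sketch of the new initialization sequence, assuming the `llama.h`/`ggml.h` headers from this submodule revision; the strategy constants named in the comments are my reading of `ggml.h`, not part of this patch.

```c
// Sketch only: backend + NUMA init as split by this llama.cpp revision.
#include "llama.h"   // pulls in ggml.h, which defines ggml_numa_strategy

int main(void) {
    // Previously a single call: llama_backend_init(bool numa).
    llama_backend_init();

    // NUMA handling is now a separate call taking a strategy enum;
    // 0 is GGML_NUMA_STRATEGY_DISABLED, 1 is GGML_NUMA_STRATEGY_DISTRIBUTE,
    // which matches the value the Go code above passes for opts.UseNUMA.
    llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);

    // ... load a model and serve requests here ...

    llama_backend_free();
    return 0;
}
```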