From 325d74985b9f31917ead1585ea22389a39b280b5 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Thu, 21 Dec 2023 16:23:36 -0800
Subject: [PATCH] Fix CPU performance on hyperthreaded systems

The default thread count logic was broken: it produced twice as many
threads as intended on hyperthreaded CPUs, causing thrashing and poor
performance.
---
 llm/ext_server.go                             |  7 +------
 .../0001-Expose-callable-API-for-server.patch | 14 ++++++++------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/llm/ext_server.go b/llm/ext_server.go
index 45251cc5..0d3327da 100644
--- a/llm/ext_server.go
+++ b/llm/ext_server.go
@@ -37,7 +37,6 @@ import (
 	"fmt"
 	"log"
 	"os"
-	"runtime"
 	"strings"
 	"sync"
 	"time"
@@ -185,11 +184,7 @@ func newExtServer(server extServer, model string, adapters, projectors []string,
 		sparams.mmproj = nil
 	}
 
-	if opts.NumThread > 0 {
-		sparams.n_threads = C.uint(opts.NumThread)
-	} else {
-		sparams.n_threads = C.uint(runtime.NumCPU())
-	}
+	sparams.n_threads = C.uint(opts.NumThread)
 
 	log.Printf("Initializing internal llama server")
 	resp := newExtServerResp(128)
diff --git a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch
index 2e5a981e..07e42972 100644
--- a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch
+++ b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch
@@ -1,4 +1,4 @@
-From b5e195803e2a989e57eef0010adce778df1e2d01 Mon Sep 17 00:00:00 2001
+From 7184ae16e8fd0e9e91cac4c81daa323057fa992b Mon Sep 17 00:00:00 2001
 From: Daniel Hiltgen
 Date: Mon, 13 Nov 2023 12:25:58 -0800
 Subject: [PATCH] Expose callable API for server
@@ -6,10 +6,10 @@ Subject: [PATCH] Expose callable API for server
 This adds an extern "C" interface within the example server
 ---
  examples/server/CMakeLists.txt | 24 +++
- examples/server/server.cpp | 274 +++++++++++++++++++++++++++++++++
+ examples/server/server.cpp | 276 +++++++++++++++++++++++++++++++++
  examples/server/server.h | 89 +++++++++++
  ggml-cuda.cu | 1 +
- 4 files changed, 388 insertions(+)
+ 4 files changed, 390 insertions(+)
  create mode 100644 examples/server/server.h
 
 diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
@@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644
 +endif()
 \ No newline at end of file
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 0403853..2084fd8 100644
+index 0403853..065420c 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
 @@ -5,6 +5,9 @@
@@ -67,7 +67,7 @@ index 0403853..2084fd8 100644
 int main(int argc, char **argv)
 {
 #if SERVER_VERBOSE != 1
-@@ -3123,3 +3127,273 @@ int main(int argc, char **argv)
+@@ -3123,3 +3127,275 @@ int main(int argc, char **argv)
  llama_backend_free();
  return 0;
 }
 +
 +    gpt_params params;
 +    params.n_ctx = sparams->n_ctx;
 +    params.n_batch = sparams->n_batch;
-+    params.n_threads = sparams->n_threads;
++    if (sparams->n_threads > 0) {
++        params.n_threads = sparams->n_threads;
++    }
 +    params.n_parallel = sparams->n_parallel;
 +    params.rope_freq_base = sparams->rope_freq_base;
 +    params.rope_freq_scale = sparams->rope_freq_scale;
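
Background on the fix: runtime.NumCPU() reports logical CPUs, so on a
machine with hyperthreading enabled it returns twice the physical core
count; using it as the fallback for n_threads oversubscribed the cores.
A minimal standalone Go sketch (illustrative only, not code from this
repository) showing the distinction:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// runtime.NumCPU counts *logical* CPUs: with hyperthreading enabled,
	// an 8-core machine reports 16, so defaulting a worker-thread count
	// to this value doubles the intended number of threads.
	fmt.Println("logical CPUs:", runtime.NumCPU())
}

After this patch, the Go side passes opts.NumThread through unchanged,
and the C++ server only applies it when it is greater than zero, so an
unset value (0) falls back to the llama.cpp server's own thread default
rather than to NumCPU().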