From d18282bfda97fcde66465a78f3bc869ebe13db4b Mon Sep 17 00:00:00 2001 From: Bruce MacDonald Date: Tue, 5 Sep 2023 19:37:13 -0400 Subject: [PATCH] metal: add missing barriers for mul-mat (#469) --- llm/llama.cpp/generate.go | 4 +- llm/llama.cpp/generate_darwin_amd64.go | 2 + llm/llama.cpp/generate_darwin_arm64.go | 4 +- ...dd-missing-barriers-for-mul-mat-2699.patch | 32 +++++++++++++++ ...onization-in-new-matrix-multiplicati.patch | 30 ++++++++++++++ ...dd-missing-barriers-for-mul-mat-2699.patch | 41 +++++++++++++++++++ 6 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch create mode 100644 llm/llama.cpp/ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch create mode 100644 llm/llama.cpp/ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch diff --git a/llm/llama.cpp/generate.go b/llm/llama.cpp/generate.go index dbbf05b7..ae0582d0 100644 --- a/llm/llama.cpp/generate.go +++ b/llm/llama.cpp/generate.go @@ -7,5 +7,7 @@ package llm //go:generate git submodule update --force ggml //go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch //go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch -//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on +//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch +//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch +//go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on //go:generate cmake --build ggml/build/cpu --target server --config Release diff --git a/llm/llama.cpp/generate_darwin_amd64.go b/llm/llama.cpp/generate_darwin_amd64.go index 1d7f94ae..c2fd9f3e 100644 --- a/llm/llama.cpp/generate_darwin_amd64.go +++ b/llm/llama.cpp/generate_darwin_amd64.go @@ -4,5 +4,7 @@ package llm //go:generate git submodule update --force ggml //go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch //go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch +//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch +//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch //go:generate cmake --fresh -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 //go:generate cmake --build ggml/build/cpu --target server --config Release diff --git a/llm/llama.cpp/generate_darwin_arm64.go b/llm/llama.cpp/generate_darwin_arm64.go index 934c943f..4f48ebb8 100644 --- a/llm/llama.cpp/generate_darwin_arm64.go +++ b/llm/llama.cpp/generate_darwin_arm64.go @@ -4,5 +4,7 @@ package llm //go:generate git submodule update --force ggml //go:generate git -C ggml apply ../ggml_patch/0001-add-detokenize-endpoint.patch //go:generate git -C ggml apply ../ggml_patch/0002-34B-model-support.patch -//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 +//go:generate git -C ggml apply ../ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch +//go:generate git -C ggml apply ../ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch +//go:generate cmake --fresh -S ggml -B ggml/build/gpu -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 //go:generate cmake --build ggml/build/gpu --target server --config Release diff --git a/llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch b/llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch new file mode 100644 index 00000000..870e982a --- /dev/null +++ b/llm/llama.cpp/ggml_patch/0003-metal-add-missing-barriers-for-mul-mat-2699.patch @@ -0,0 +1,32 @@ +From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001 +From: Bruce MacDonald +Date: Tue, 5 Sep 2023 16:05:08 -0400 +Subject: [PATCH] metal: add missing barriers for mul-mat #2699 + +--- + ggml-metal.metal | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/ggml-metal.metal b/ggml-metal.metal +index 3f31252..ce3541f 100644 +--- a/ggml-metal.metal ++++ b/ggml-metal.metal +@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0, + //load data and store to threadgroup memory + half4x4 temp_a; + dequantize_func(x, il, temp_a); ++ threadgroup_barrier(mem_flags::mem_threadgroup); + #pragma unroll(16) + for (int i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ +@@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0, + } + } else { + // block is smaller than 64x32, we should avoid writing data outside of the matrix ++ threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ + + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; + for (int i = 0; i < 8; i++) { +-- +2.39.2 (Apple Git-143) + diff --git a/llm/llama.cpp/ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch b/llm/llama.cpp/ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch new file mode 100644 index 00000000..e5540ab1 --- /dev/null +++ b/llm/llama.cpp/ggml_patch/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch @@ -0,0 +1,30 @@ +From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001 +From: Shouzheng Liu +Date: Mon, 21 Aug 2023 06:59:29 -0400 +Subject: [PATCH] metal : fix synchronization in new matrix multiplication + kernel (#2686) + +--- + ggml-metal.metal | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/ggml-metal.metal b/ggml-metal.metal +index 3f31252..88d48f6 100644 +--- a/ggml-metal.metal ++++ b/ggml-metal.metal +@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0, + threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ + + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; + for (int i = 0; i < 8; i++) { ++ threadgroup_barrier(mem_flags::mem_device); + simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + } + +- threadgroup_barrier(mem_flags::mem_threadgroup); ++ threadgroup_barrier(mem_flags::mem_device); + device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; + if (sgitg==0) { + for (int i = 0; i < n_rows; i++) { +-- +2.41.0 + diff --git a/llm/llama.cpp/ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch b/llm/llama.cpp/ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch new file mode 100644 index 00000000..a2649097 --- /dev/null +++ b/llm/llama.cpp/ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch @@ -0,0 +1,41 @@ +From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001 +From: Shouzheng Liu +Date: Tue, 22 Aug 2023 02:18:40 -0400 +Subject: [PATCH] metal : add missing barriers for mul-mat (#2699) + +--- + ggml-metal.metal | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/ggml-metal.metal b/ggml-metal.metal +index 88d48f6..ce3541f 100644 +--- a/ggml-metal.metal ++++ b/ggml-metal.metal +@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0, + //load data and store to threadgroup memory + half4x4 temp_a; + dequantize_func(x, il, temp_a); ++ threadgroup_barrier(mem_flags::mem_threadgroup); + #pragma unroll(16) + for (int i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ +@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0, + } + } else { + // block is smaller than 64x32, we should avoid writing data outside of the matrix ++ threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ + + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; + for (int i = 0; i < 8; i++) { +- threadgroup_barrier(mem_flags::mem_device); + simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + } + +- threadgroup_barrier(mem_flags::mem_device); ++ threadgroup_barrier(mem_flags::mem_threadgroup); + device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; + if (sgitg==0) { + for (int i = 0; i < n_rows; i++) { +-- +2.41.0 +