From ddf5c09a9b9692cf8cced925c7267917a4a6a07d Mon Sep 17 00:00:00 2001 From: jmorganca Date: Thu, 25 Apr 2024 00:33:33 -0400 Subject: [PATCH] use matrix multiplication kernels in more cases --- llm/patches/04-metal.diff | 45 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 llm/patches/04-metal.diff diff --git a/llm/patches/04-metal.diff b/llm/patches/04-metal.diff new file mode 100644 index 00000000..f8fa7db7 --- /dev/null +++ b/llm/patches/04-metal.diff @@ -0,0 +1,45 @@ +diff --git a/ggml-metal.m b/ggml-metal.m +index 0207b787..b5e9884b 100644 +--- a/ggml-metal.m ++++ b/ggml-metal.m +@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute( + // to the matrix-vector kernel + int ne11_mm_min = 1; + +-#if 0 + // the numbers below are measured on M2 Ultra for 7B and 13B models + // these numbers do not translate to other devices or model sizes + // TODO: need to find a better approach +- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) { +- switch (src0t) { +- case GGML_TYPE_F16: ne11_mm_min = 2; break; +- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break; +- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break; +- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break; +- case GGML_TYPE_Q4_0: +- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break; +- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break; +- case GGML_TYPE_Q5_0: // not tested yet +- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet +- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break; +- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break; +- default: ne11_mm_min = 1; break; +- } ++ switch (src0t) { ++ case GGML_TYPE_F16: ne11_mm_min = 2; break; ++ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break; ++ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break; ++ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break; ++ case GGML_TYPE_Q4_0: ++ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break; ++ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break; ++ case GGML_TYPE_Q5_0: // not tested yet ++ case GGML_TYPE_Q5_1: 
ne11_mm_min = 13; break; // not tested yet ++ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break; ++ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break; ++ default: ne11_mm_min = 1; break; + } +-#endif + + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel