diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 1e230611..fc673c47 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -425,7 +425,7 @@ struct llama_server_context
 
         n_ctx = llama_n_ctx(ctx);
 
-        add_bos_token = llama_should_add_bos_token(model);
+        add_bos_token = llama_add_bos_token(model);
 
         return true;
     }
@@ -1031,7 +1031,7 @@ struct llama_server_context
                 continue;
             }
 
-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
                 LOG_TEE("Error processing the given image");
                 return false;
             }
@@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("options:\n");
     printf("  -h, --help                show this help message and exit\n");
     printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads);
     printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
     printf("  --threads-http N          number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
     printf("  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
@@ -2287,7 +2287,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-            params.n_threads = std::stoi(argv[i]);
+            params.cpuparams.n_threads = std::stoi(argv[i]);
         }
         else if (arg == "--grp-attn-n" || arg == "-gan")
         {
@@ -2315,7 +2315,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-            params.n_threads_batch = std::stoi(argv[i]);
+            params.cpuparams_batch.n_threads = std::stoi(argv[i]);
         }
         else if (arg == "--threads-http")
         {
@@ -2626,6 +2626,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
         params.kv_overrides.back().key[0] = 0;
     }
 
+    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
     if (invalid_param)
     {
         fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
@@ -2775,8 +2780,8 @@ int main(int argc, char **argv) {
                             {"commit", LLAMA_COMMIT}});
 
     LOG_INFO("system info", {
-        {"n_threads", params.n_threads},
-        {"n_threads_batch", params.n_threads_batch},
+        {"n_threads", params.cpuparams.n_threads},
+        {"n_threads_batch", params.cpuparams_batch.n_threads},
         {"total_threads", std::thread::hardware_concurrency()},
         {"system_info", llama_print_system_info()},
     });
diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index f22c0f8e..acea9c8d 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -19,7 +19,7 @@ sign() {
     fi
 }
 
-COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
+COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 
-DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off" case "${GOARCH}" in "amd64") diff --git a/llm/llama.cpp b/llm/llama.cpp index 1e6f6554..8962422b 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit 1e6f6554aa11fa10160a5fda689e736c3c34169f +Subproject commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index 0d40fc3c..351bcaef 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,8 +1,8 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index a207451f..2ddf431d 100644 +index 88355971..dd7d41ed 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -5347,16 +5347,7 @@ static void llm_load_vocab( +@@ -6083,16 +6083,7 @@ static void llm_load_vocab( if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { vocab.tokenizer_add_space_prefix = false; vocab.tokenizer_clean_spaces = true; @@ -20,9 +20,9 @@ index a207451f..2ddf431d 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -5443,7 +5434,8 @@ static void llm_load_vocab( - tokenizer_pre == "codeshell") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL; +@@ -6188,7 +6179,8 @@ static void llm_load_vocab( + tokenizer_pre == "exaone") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); diff --git a/llm/patches/06-embeddings.diff b/llm/patches/06-embeddings.diff index a84e3b06..f3c071cb 100644 --- a/llm/patches/06-embeddings.diff +++ b/llm/patches/06-embeddings.diff @@ -1,37 +1,36 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index 1fe2b9f7..a43312a7 100644 +index 88355971..d7db689b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { +@@ -15906,7 +15906,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { const auto n_embd = hparams.n_embd; // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; + const bool has_logits = cparams.causal_attn; - const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE)); + const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); const size_t logits_size = has_logits ? 
n_vocab*n_outputs_max : 0; -@@ -13959,17 +13959,25 @@ static int llama_decode_internal( +@@ -16175,20 +16175,23 @@ static int llama_decode_internal( // no output res = nullptr; embd = nullptr; - } else if (cparams.embeddings) { -- res = nullptr; // do not extract logits for embedding case -- embd = gf->nodes[gf->n_nodes - 1]; -- if (strcmp(embd->name, "result_embd_pooled") != 0) { -- embd = gf->nodes[gf->n_nodes - 2]; +- res = nullptr; // do not extract logits for embedding case +- embd = nullptr; + } + + if (cparams.embeddings) { -+ for (int i = gf->n_nodes - 1; i >= 0; --i) { + for (int i = gf->n_nodes - 1; i >= 0; --i) { +- if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) { +- embd = gf->nodes[i]; + embd = gf->nodes[i]; + if (strcmp(embd->name, "result_embd_pooled") == 0) { -+ break; -+ } + break; + } } - GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); -- } else { -+ } else { +- GGML_ASSERT(embd != nullptr && "missing embeddings tensor"); + } else { embd = nullptr; // do not extract embeddings when not needed GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } @@ -39,7 +38,6 @@ index 1fe2b9f7..a43312a7 100644 + if (!cparams.causal_attn) { + res = nullptr; // do not extract logits when not needed + } -+ // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); ggml_backend_sched_alloc_graph(lctx.sched, gf); diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff deleted file mode 100644 index 21958476..00000000 --- a/llm/patches/09-lora.diff +++ /dev/null @@ -1,350 +0,0 @@ -diff --git a/common/common.cpp b/common/common.cpp -index 2e8374d5..70d0afde 100644 ---- a/common/common.cpp -+++ b/common/common.cpp -@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { - loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); - if (loaded_la.adapter == nullptr) { - fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); -- llama_free(lctx); -- llama_free_model(model); -- return iparams; -+ -+ // if that fails, try loading as ggla for compatibility -+ int err = llama_model_apply_lora_from_file(model, -+ la.path.c_str(), -+ la.scale, -+ nullptr, -+ params.n_threads); -+ if (err != 0) { -+ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); -+ llama_free(lctx); -+ llama_free_model(model); -+ return iparams; -+ } else { -+ break; -+ } - } - iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters - } -diff --git a/include/llama.h b/include/llama.h -index 93fd77ca..b0fb37a6 100644 ---- a/include/llama.h -+++ b/include/llama.h -@@ -1160,6 +1160,20 @@ extern "C" { - - LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); - -+ // Apply a LoRA adapter to a loaded model -+ // path_base_model is the path to a higher quality model to use as a base for -+ // the layers modified by the adapter. Can be NULL to use the current loaded model. 
-+ // The model needs to be reloaded before applying a new adapter, otherwise the adapter -+ // will be applied on top of the previous one -+ // Returns 0 on success -+ LLAMA_API int32_t llama_model_apply_lora_from_file( -+ const struct llama_model * model, -+ const char * path_lora, -+ float scale, -+ const char * path_base_model, -+ int32_t n_threads); -+ -+ - #ifdef __cplusplus - } - #endif -diff --git a/src/llama.cpp b/src/llama.cpp -index 80a0dd0f..9d7b0e17 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, - fputs(text, stderr); - fflush(stderr); - } -+ -+static int llama_apply_lora_from_file_internal( -+ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads -+) { -+ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); -+ -+ const int64_t t_start_lora_us = ggml_time_us(); -+ -+ llama_file fin(path_lora, "rb"); -+ -+ // verify magic and version -+ { -+ uint32_t magic = fin.read_u32(); -+ if (magic != LLAMA_FILE_MAGIC_GGLA) { -+ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); -+ return 1; -+ } -+ -+ uint32_t format_version = fin.read_u32(); -+ if (format_version != 1) { -+ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); -+ return 1; -+ } -+ } -+ -+ int32_t lora_r = fin.read_u32(); -+ int32_t lora_alpha = fin.read_u32(); -+ float scaling = scale * (float)lora_alpha / (float)lora_r; -+ -+ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); -+ -+ // load base model -+ std::unique_ptr ml; -+ if (path_base_model) { -+ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); -+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr)); -+ ml->init_mappings(/*prefetch*/ false); // no prefetching -+ } -+ -+ struct tensor_meta { -+ std::string name; -+ ggml_type type; -+ int32_t ne[2]; -+ size_t offset; -+ }; -+ std::map tensor_meta_map; -+ -+ // load all tensor meta -+ while (true) { -+ if (fin.tell() == fin.size) { -+ // eof -+ break; -+ } -+ -+ int32_t n_dims; -+ int32_t name_len; -+ int32_t ftype; -+ -+ fin.read_raw(&n_dims, sizeof(n_dims)); -+ fin.read_raw(&name_len, sizeof(name_len)); -+ fin.read_raw(&ftype, sizeof(ftype)); -+ -+ if (n_dims != 1 && n_dims != 2) { -+ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); -+ return 1; -+ } -+ -+ int32_t ne[2] = { 1, 1 }; -+ for (int i = 0; i < n_dims; ++i) { -+ fin.read_raw(&ne[i], sizeof(ne[i])); -+ } -+ -+ std::string name; -+ { -+ GGML_ASSERT(name_len < GGML_MAX_NAME); -+ char buf[GGML_MAX_NAME]; -+ fin.read_raw(buf, name_len); -+ name = std::string(buf, name_len); -+ } -+ -+ // check for lora suffix -+ std::string lora_suffix; -+ if (name.length() > 6) { -+ lora_suffix = name.substr(name.length() - 6); -+ } -+ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { -+ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); -+ return 1; -+ } -+ -+ // tensor type -+ ggml_type wtype; -+ switch (ftype) { -+ case 0: wtype = GGML_TYPE_F32; break; -+ case 1: wtype = GGML_TYPE_F16; break; -+ default: -+ { -+ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", -+ __func__, ftype); -+ return 1; -+ } -+ } -+ -+ // data offset -+ size_t offset = fin.tell(); -+ offset = (offset + 31) & -32; -+ -+ // skip tensor data -+ fin.seek(offset + 
ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); -+ -+ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); -+ } -+ -+ bool warned = false; -+ int n_tensors = 0; -+ -+ // apply -+ ggml_backend_t backend_cpu = ggml_backend_cpu_init(); -+ if (backend_cpu == nullptr) { -+ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); -+ return 1; -+ } -+ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); -+ -+ std::vector> read_buf; -+ for (const auto & it : model.tensors_by_name) { -+ const std::string & base_name = it.first; -+ ggml_tensor * model_t = it.second; -+ -+ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || -+ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { -+ continue; -+ } -+ -+ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); -+ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); -+ -+ ggml_init_params lora_init_params = { -+ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), -+ /* .mem_buffer */ nullptr, -+ /* .no_alloc */ true, -+ }; -+ ggml_context * lora_ctx = ggml_init(lora_init_params); -+ if (lora_ctx == nullptr) { -+ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); -+ ggml_backend_free(backend_cpu); -+ return 1; -+ } -+ -+ // create tensors -+ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]); -+ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); -+ ggml_set_name(loraA, metaA.name.c_str()); -+ ggml_set_name(loraB, metaB.name.c_str()); -+ -+ ggml_tensor * base_t; -+ if (ml) { -+ if (!ml->get_tensor_meta(base_name.c_str())) { -+ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); -+ return 1; -+ } -+ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); -+ } else { -+ base_t = ggml_dup_tensor(lora_ctx, model_t); -+ } -+ ggml_set_name(base_t, base_name.c_str()); -+ -+ // allocate in backend buffer -+ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); -+ if (lora_buf == nullptr) { -+ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); -+ return 1; -+ } -+ -+ // load tensor data -+ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { -+ read_buf.resize(ggml_nbytes(tensor)); -+ fin.seek(tensor_meta.offset, SEEK_SET); -+ fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); -+ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); -+ }; -+ load_tensor(metaA, loraA); -+ load_tensor(metaB, loraB); -+ -+ // load base model tensor data -+ if (ml) { -+ ml->load_data_for(base_t); -+ } else { -+ ggml_backend_tensor_copy(model_t, base_t); -+ } -+ -+ if (ggml_is_quantized(base_t->type) && !warned) { -+ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " -+ "use a f16 or f32 base model with --lora-base\n", __func__); -+ warned = true; -+ } -+ -+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { -+ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" -+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); -+ ggml_free(lora_ctx); -+ ggml_backend_buffer_free(lora_buf); -+ ggml_backend_free(backend_cpu); -+ return 1; -+ } -+ -+ auto build_lora_graph = [&]() { -+ // w = w + BA*s -+ ggml_tensor * BA = 
ggml_mul_mat(lora_ctx, loraA, loraB); -+ ggml_set_name(BA, "BA"); -+ -+ if (scaling != 1.0f) { -+ BA = ggml_scale(lora_ctx, BA, scaling); -+ ggml_set_name(BA, "BA_scaled"); -+ } -+ -+ ggml_tensor * r; -+ r = ggml_add_inplace(lora_ctx, base_t, BA); -+ ggml_set_name(r, "r_add"); -+ -+ if (base_t->type != model_t->type) { -+ // convert the result to the model type -+ r = ggml_cast(lora_ctx, r, model_t->type); -+ ggml_set_name(r, "r_cast"); -+ } -+ -+ return r; -+ }; -+ -+ ggml_cgraph * gf = ggml_new_graph(lora_ctx); -+ ggml_tensor * r = build_lora_graph(); -+ ggml_build_forward_expand(gf, r); -+ -+ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); -+ if (graph_buf == nullptr) { -+ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); -+ ggml_free(lora_ctx); -+ ggml_backend_buffer_free(lora_buf); -+ ggml_backend_free(backend_cpu); -+ return 1; -+ } -+ -+ ggml_backend_graph_compute(backend_cpu, gf); -+ -+ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); -+ -+#if 0 -+ // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU -+ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); -+ -+ // sched compute -+ ggml_build_forward_expand(gf, build_graph()); -+ ggml_backend_sched_init_measure(sched, gf); -+ -+ // create the graph again, since the previous one was destroyed by the measure -+ ggml_graph_clear(gf); -+ ggml_build_forward_expand(gf, build_graph()); -+ ggml_backend_sched_graph_compute(sched, gf); -+ ggml_backend_sched_free(sched); -+#endif -+ -+ ggml_backend_buffer_free(lora_buf); -+ ggml_backend_buffer_free(graph_buf); -+ ggml_free(lora_ctx); -+ -+ n_tensors++; -+ if (n_tensors % 4 == 0) { -+ LLAMA_LOG_INFO("."); -+ } -+ } -+ -+ ggml_backend_free(backend_cpu); -+ -+ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; -+ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); -+ -+ return 0; -+} -+ -+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { -+ try { -+ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); -+ } catch (const std::exception & err) { -+ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); -+ return 1; -+ } -+} -\ No newline at end of file \ No newline at end of file diff --git a/llm/patches/11-phi3-sliding-window.diff b/llm/patches/11-phi3-sliding-window.diff deleted file mode 100644 index fde3dd21..00000000 --- a/llm/patches/11-phi3-sliding-window.diff +++ /dev/null @@ -1,43 +0,0 @@ -From 6eedae4cf2fcc8015dac79cb3f28f61fcabacab2 Mon Sep 17 00:00:00 2001 -From: Michael Yang -Date: Wed, 31 Jul 2024 14:57:04 -0700 -Subject: [PATCH] phi3 sliding window - ---- - src/llama.cpp | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/src/llama.cpp b/src/llama.cpp -index a207451f..f2872d4e 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -4893,7 +4893,7 @@ static void llm_load_hparams( - } break; - case LLM_ARCH_PHI3: - { -- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); -+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - - switch (hparams.n_layer) { -@@ -10762,7 +10762,7 @@ struct llm_build_context { - struct ggml_tensor * inp_pos = build_inp_pos(); - - // KQ_mask (mask for 1 
head, it will be broadcasted to all heads) -- struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(); -+ struct ggml_tensor * KQ_mask = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : build_inp_KQ_mask(); - - for (int il = 0; il < n_layer; ++il) { - auto residual = inpL; -@@ -10820,7 +10820,7 @@ struct llm_build_context { - - cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, -- Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il); -+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); - } - - if (il == n_layer - 1) { --- -2.45.2 -