diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 071fe1e7..c65901c7 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -403,7 +403,9 @@ struct llama_server_context
             }
         }
 
-        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        auto init_result = llama_init_from_gpt_params(params);
+        model = init_result.model;
+        ctx = init_result.context;
         if (model == nullptr)
         {
             LOG_ERROR("unable to load model", {{"model", params.model}});
@@ -2422,7 +2424,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.emplace_back(argv[i], 1.0f);
+            params.lora_adapters.push_back({
+                std::string(argv[i]),
+                1.0,
+            });
             params.use_mmap = false;
         }
         else if (arg == "--lora-scaled")
@@ -2438,7 +2443,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+            params.lora_adapters.push_back({
+                lora_adapter,
+                std::stof(argv[i])
+            });
             params.use_mmap = false;
         }
         else if (arg == "-v" || arg == "--verbose")
diff --git a/llm/llama.cpp b/llm/llama.cpp
index 6eeaeba1..1e6f6554 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit 6eeaeba126ff701f3e8f79f246805b7023709972
+Subproject commit 1e6f6554aa11fa10160a5fda689e736c3c34169f
diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff
index 10c66d1d..21958476 100644
--- a/llm/patches/09-lora.diff
+++ b/llm/patches/09-lora.diff
@@ -1,40 +1,32 @@
 diff --git a/common/common.cpp b/common/common.cpp
-index dbb724fb..c26fe6ee 100644
+index 2e8374d5..70d0afde 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -2087,14 +2087,27 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
-     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-         float lora_scale = std::get<1>(params.lora_adapter[i]);
-+
-
-+        // try to load as gguf
-         auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-         if (adapter == nullptr) {
--            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+         loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+         if (loaded_la.adapter == nullptr) {
+             fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
 -            llama_free(lctx);
 -            llama_free_model(model);
--            return std::make_tuple(nullptr, nullptr);
-+            fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__);
+-            return iparams;
 +
 +            // if that fails, try loading as ggla for compatibility
 +            int err = llama_model_apply_lora_from_file(model,
-+                                                lora_adapter.c_str(),
-+                                                lora_scale,
++                                                la.path.c_str(),
++                                                la.scale,
 +                                                nullptr,
 +                                                params.n_threads);
 +            if (err != 0) {
 +                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
 +                llama_free(lctx);
 +                llama_free_model(model);
-+                return std::make_tuple(nullptr, nullptr);
++                return iparams;
++            } else {
++                break;
 +            }
-+        } else {
-+            llama_lora_adapter_set(lctx, adapter, lora_scale);
          }
--        llama_lora_adapter_set(lctx, adapter, lora_scale);
+         iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
      }
-
-     if (params.ignore_eos) {
 diff --git a/include/llama.h b/include/llama.h
 index 93fd77ca..b0fb37a6 100644
 --- a/include/llama.h
@@ -355,4 +347,4 @@ index 80a0dd0f..9d7b0e17 100644
 +        return 1;
 +    }
 +}
-\ No newline at end of file
+\ No newline at end of file
\ No newline at end of file
diff --git a/llm/patches/10-params.diff b/llm/patches/10-params.diff
deleted file mode 100644
index 56699b8e..00000000
--- a/llm/patches/10-params.diff
+++ /dev/null
@@ -1,20 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index a207451f..fba6b175 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -4969,6 +4969,7 @@ static void llm_load_hparams(
-                hparams.attn_soft_cap = true;
-
-                switch (hparams.n_layer) {
-+                    case 26: model.type = e_model::MODEL_2B; break;
-                    case 42: model.type = e_model::MODEL_9B; break;
-                    case 46: model.type = e_model::MODEL_27B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-@@ -11736,6 +11737,7 @@ struct llm_build_context {
-
-                // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
-                switch (model.type) {
-+                    case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
-                    case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
-                    case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
-                    default: GGML_ABORT("fatal error");
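
Illustrative sketch, not part of the patch: the server.cpp hunk above migrates from the old tuple-returning API to the struct-returning llama_init_from_gpt_params(). A minimal standalone consumer might look like the following, assuming the llama_init_result and llama_lora_adapter_info definitions from common/common.h at submodule commit 1e6f6554; the file paths are hypothetical.

#include "common.h"

int main() {
    gpt_params params;
    params.model = "model.gguf";                              // hypothetical model path
    params.lora_adapters.push_back({"adapter.gguf", 1.0f});   // hypothetical adapter path + scale,
                                                              // mirroring the --lora branch above
    llama_backend_init();

    // The struct-returning API this change adopts, replacing
    // std::tie(model, ctx) = llama_init_from_gpt_params(params);
    auto init_result = llama_init_from_gpt_params(params);
    llama_model   * model = init_result.model;
    llama_context * ctx   = init_result.context;
    if (model == nullptr) {
        return 1;   // load failure: init already freed/never created the pair
    }

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}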