diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 14d921c0..0d51460c 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -2438,15 +2438,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
             params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
             params.use_mmap = false;
         }
-        else if (arg == "--lora-base")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_base = argv[i];
-        }
         else if (arg == "-v" || arg == "--verbose")
         {
             server_verbose = true;
diff --git a/llm/llama.cpp b/llm/llama.cpp
index d94c6e0c..6eeaeba1 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit d94c6e0ccbd29ee1ba4f44e9caa8682ad94df9fa
+Subproject commit 6eeaeba126ff701f3e8f79f246805b7023709972
diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff
index 646bc49c..0d40fc3c 100644
--- a/llm/patches/05-default-pretokenizer.diff
+++ b/llm/patches/05-default-pretokenizer.diff
@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 8fe51971..7113ba64 100644
+index a207451f..2ddf431d 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5433,16 +5433,7 @@ static void llm_load_vocab(
+@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
          if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
              vocab.tokenizer_add_space_prefix = false;
              vocab.tokenizer_clean_spaces = true;
@@ -20,9 +20,9 @@ index 8fe51971..7113ba64 100644
                  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
              } else if (
                  tokenizer_pre == "llama3" ||
-@@ -5526,7 +5517,8 @@ static void llm_load_vocab(
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
-                 vocab.tokenizer_clean_spaces = false;
+@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
+                 tokenizer_pre == "codeshell") {
+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
              } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff
index fc1017a6..10c66d1d 100644
--- a/llm/patches/09-lora.diff
+++ b/llm/patches/09-lora.diff
@@ -2,7 +2,7 @@ diff --git a/common/common.cpp b/common/common.cpp
 index dbb724fb..c26fe6ee 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -2087,14 +2087,29 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
+@@ -2087,14 +2087,27 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
      for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
          const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
          float lora_scale = std::get<1>(params.lora_adapter[i]);
@@ -20,9 +20,7 @@ index dbb724fb..c26fe6ee 100644
 +        int err = llama_model_apply_lora_from_file(model,
 +                                                   lora_adapter.c_str(),
 +                                                   lora_scale,
-+                                                   ((i > 0) || params.lora_base.empty())
-+                                                      ? NULL
-+                                                      : params.lora_base.c_str(),
++                                                   nullptr,
 +                                                   params.n_threads);
 +        if (err != 0) {
 +            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
diff --git a/llm/patches/10-llama3-rope.diff b/llm/patches/10-llama3-rope.diff
deleted file mode 100644
index 39f38fea..00000000
--- a/llm/patches/10-llama3-rope.diff
+++ /dev/null
@@ -1,70 +0,0 @@
-From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
-From: Michael Yang
-Date: Tue, 23 Jul 2024 14:33:29 -0700
-Subject: [PATCH] llama 3.1 rope scaling
-
----
- src/llama.cpp | 14 ++++++++++++--
- 1 file changed, 12 insertions(+), 2 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 8fe51971..a9969df8 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -2472,6 +2472,7 @@ struct llama_layer {
-     // long rope factors
-     struct ggml_tensor * rope_long = nullptr;
-     struct ggml_tensor * rope_short = nullptr;
-+    struct ggml_tensor * rope_freqs = nullptr;
- 
-     // bitnet scale
-     struct ggml_tensor * wq_scale;
-@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(
- 
-         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- 
-+        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-+
-         if (n_expert == 0) {
-             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
-             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
-@@ -8620,6 +8623,10 @@ struct llm_build_context {
-         // choose long/short freq factors based on the context size
-         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
- 
-+        if (model.layers[il].rope_freqs != nullptr) {
-+            return model.layers[il].rope_freqs;
-+        }
-+
-         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
-             return model.layers[il].rope_long;
-         }
-@@ -8814,6 +8821,9 @@ struct llm_build_context {
- 
-         // self-attention
-         {
-+            // rope freq factors for llama3; may return nullptr for llama2 and other models
-+            struct ggml_tensor * rope_factors = build_rope_factors(il);
-+
-             // compute Q and K and RoPE them
-             struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-             cb(Qcur, "Qcur", il);
-@@ -8837,14 +8847,14 @@ struct llm_build_context {
-             }
- 
-             Qcur = ggml_rope_ext(
--                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
-                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                 ext_factor, attn_factor, beta_fast, beta_slow
-             );
-             cb(Qcur, "Qcur", il);
- 
-             Kcur = ggml_rope_ext(
--                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
-                 n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                 ext_factor, attn_factor, beta_fast, beta_slow
-             );
--- 
-2.45.2