Merge pull request #6767 from ollama/jessegross/bug_6707
runner: Flush pending responses before returning
Commit c354e87809

Changed file: llm/ext_server/server.cpp (vendored), 60 changed lines
@@ -913,7 +913,9 @@ struct llama_server_context
         slot.sampled = result.tok;
 
         // search stop word and delete it
-        slot.generated_text += token_str;
+        if (!llama_token_is_eog(model, result.tok)) {
+            slot.generated_text += token_str;
+        }
         slot.has_next_token = true;
 
         if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
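On its own, the hunk above establishes one invariant: the decoded piece for an end-of-generation token (for example </s> or <|eot_id|>) never enters slot.generated_text, so it cannot be scanned as a stop string or streamed to the client. Below is a minimal sketch of that guard, assuming llama.h for llama_model, llama_token and llama_token_is_eog; the Slot template parameter and the append_piece helper are hypothetical stand-ins for the server's slot type and the surrounding process_token logic, not code from this file.

    // Hypothetical helper, not part of server.cpp: accumulate real content only.
    #include <string>
    #include "llama.h"

    template <typename Slot>
    void append_piece(const llama_model *model, Slot &slot,
                      llama_token tok, const std::string &token_str) {
        if (!llama_token_is_eog(model, tok)) {
            slot.generated_text += token_str;  // EOG pieces are dropped here
        }
        slot.has_next_token = true;            // stopping is decided later
    }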
@@ -954,30 +956,36 @@ struct llama_server_context
         if (!incomplete)
         {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
+            if (!llama_token_is_eog(model, result.tok)) {
+                const std::string str_test = slot.generated_text.substr(pos);
+                bool is_stop_full = false;
 
-            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-            if (stop_pos != std::string::npos)
-            {
-                is_stop_full = true;
-                slot.generated_text.erase(
-                    slot.generated_text.begin() + pos + stop_pos,
-                    slot.generated_text.end());
-                pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            }
-            else
-            {
-                is_stop_full = false;
-                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-            }
-            // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-            {
-                // no send the stop word in the response
-                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                slot.n_sent_text += result.text_to_send.size();
-                // add the token to slot queue and cache
+                size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+                if (stop_pos != std::string::npos)
+                {
+                    is_stop_full = true;
+                    slot.generated_text.erase(
+                        slot.generated_text.begin() + pos + stop_pos,
+                        slot.generated_text.end());
+                    pos = std::min(slot.n_sent_text, slot.generated_text.size());
+                }
+                else
+                {
+                    is_stop_full = false;
+                    stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+                }
+
+                // check if there is any token to predict
+                if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+                {
+                    // no send the stop word in the response
+                    result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                    slot.n_sent_text += result.text_to_send.size();
+                    // add the token to slot queue and cache
+                }
+            } else {
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.n_sent_text += result.text_to_send.size();
            }
         }
 
         if (slot.params.stream)
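Together with the first hunk, this is the substance of the fix for the referenced bug: text that was held back while a stop string might still be forming is now flushed through the new else branch when the end-of-generation token arrives, instead of being silently dropped. The following self-contained toy models that behavior; it is not the ollama code, and the names Slot, find_partial_stop and process_piece are illustrative only.

    // Toy model of the streaming logic above: pieces are accumulated, anything
    // that could still become a stop string is held back, and whatever is still
    // pending is flushed when the end-of-generation token arrives.
    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <utility>
    #include <vector>

    struct Slot {
        std::string generated_text;  // everything decoded so far
        size_t      n_sent_text = 0; // how much of it has already been streamed
    };

    // Position of a suffix that matches a prefix of the stop string, or npos.
    static size_t find_partial_stop(const std::string &text, const std::string &stop) {
        for (size_t len = std::min(stop.size(), text.size()); len > 0; --len) {
            if (text.compare(text.size() - len, len, stop, 0, len) == 0) {
                return text.size() - len;
            }
        }
        return std::string::npos;
    }

    // Process one decoded piece and return the text that may be streamed now.
    static std::string process_piece(Slot &slot, const std::string &piece,
                                     bool is_eog, const std::string &stop) {
        size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());

        if (!is_eog) {
            slot.generated_text += piece;
            std::string pending = slot.generated_text.substr(pos);
            // Hold back the tail that could still turn into a stop string.
            size_t hold = find_partial_stop(pending, stop);
            std::string to_send =
                hold == std::string::npos ? pending : pending.substr(0, hold);
            slot.n_sent_text += to_send.size();
            return to_send;
        }

        // End of generation: flush whatever is still pending instead of dropping it.
        std::string pending = slot.generated_text.substr(pos);
        slot.n_sent_text += pending.size();
        return pending;
    }

    int main() {
        Slot slot;
        const std::string stop = "[INST]";
        // "[" might be the start of the stop string, so it is held back until
        // generation ends; the EOG piece itself contributes no text.
        const std::vector<std::pair<std::string, bool>> pieces = {
            {"Hello", false}, {" [", false}, {"", true}};
        for (const auto &p : pieces) {
            std::cout << process_piece(slot, p.first, p.second, stop);
        }
        std::cout << "\n";  // prints "Hello [" rather than losing the " ["
    }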
@@ -1117,9 +1125,7 @@ struct llama_server_context
             {"multimodal", multimodal}
         };
 
-        if (!llama_token_is_eog(model, tkn.tok)) {
-            res.result_json["content"] = tkn.text_to_send;
-        }
+        res.result_json["content"] = tkn.text_to_send;
 
         if (slot.sparams.n_probs > 0)
         {
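With the guard removed here, the partial-response path always carries whatever text_to_send holds, which is what lets the text flushed in the previous hunk actually reach the client on the final chunk. A rough sketch of the resulting shape follows; build_partial_json is a hypothetical helper, while json and completion_token_output are the alias and struct already used in server.cpp.

    // Hypothetical helper mirroring the send path after this change: content is
    // always populated, so text flushed on the EOG token is still delivered.
    static json build_partial_json(const completion_token_output &tkn, bool multimodal) {
        json res = {
            {"multimodal", multimodal}
        };
        res["content"] = tkn.text_to_send;  // no EOG guard any more
        return res;
    }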