From 1b0e6c9c0e5d53aa6110530da0befab7c95d1755 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Sun, 5 May 2024 20:50:31 -0700
Subject: [PATCH] Fix llava models not working after first request (#4164)

* fix llava models not working after first request

* individual requests only for llava models
---
 llm/patches/05-clip-fix.diff | 24 ++++++++++++++++++++++++
 llm/server.go                |  9 ++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 llm/patches/05-clip-fix.diff

diff --git a/llm/patches/05-clip-fix.diff b/llm/patches/05-clip-fix.diff
new file mode 100644
index 00000000..3f68a5bb
--- /dev/null
+++ b/llm/patches/05-clip-fix.diff
@@ -0,0 +1,24 @@
+diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
+index e3c9bcd4..b43f892d 100644
+--- a/examples/llava/clip.cpp
++++ b/examples/llava/clip.cpp
+@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
+     struct ggml_tensor * embeddings = inp;
+     if (ctx->has_class_embedding) {
+         embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
++    }
++    ggml_set_name(embeddings, "embeddings");
++    ggml_set_input(embeddings);
++
++    if (ctx->has_class_embedding) {
+         embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+         embeddings = ggml_acc(ctx0, embeddings, inp,
+                 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+     }
+-    ggml_set_name(embeddings, "embeddings");
+-    ggml_set_input(embeddings);
+-
+
+     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+     ggml_set_name(positions, "positions");
diff --git a/llm/server.go b/llm/server.go
index 2272ac83..44bada08 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -194,8 +194,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}
 
-	// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
 	numParallel := envconfig.NumParallel
+
+	// TODO (jmorganca): multimodal models don't support parallel yet
+	// see https://github.com/ollama/ollama/issues/4165
+	if len(projectors) > 0 {
+		numParallel = 1
+		slog.Warn("multimodal models don't support parallel requests yet")
+	}
+
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
 	for i := 0; i < len(servers); i++ {