Fix llava models not working after first request (#4164)
* fix llava models not working after first request
* individual requests only for llava models
This commit is contained in:
parent
dfa2f32ca0
commit
1b0e6c9c0e
24
llm/patches/05-clip-fix.diff
Normal file
24
llm/patches/05-clip-fix.diff
Normal file
|
@@ -0,0 +1,24 @@
|
|||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index e3c9bcd4..b43f892d 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
struct ggml_tensor * embeddings = inp;
|
||||
if (ctx->has_class_embedding) {
|
||||
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||
+ }
|
||||
+ ggml_set_name(embeddings, "embeddings");
|
||||
+ ggml_set_input(embeddings);
|
||||
+
|
||||
+ if (ctx->has_class_embedding) {
|
||||
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
}
|
||||
- ggml_set_name(embeddings, "embeddings");
|
||||
- ggml_set_input(embeddings);
|
||||
-
|
||||
|
||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||
ggml_set_name(positions, "positions");
|
|
@@ -194,8 +194,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
params = append(params, "--numa")
|
||||
}
|
||||
|
||||
// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
|
||||
numParallel := envconfig.NumParallel
|
||||
|
||||
// TODO (jmorganca): multimodal models don't support parallel yet
|
||||
// see https://github.com/ollama/ollama/issues/4165
|
||||
if len(projectors) > 0 {
|
||||
numParallel = 1
|
||||
slog.Warn("multimodal models don't support parallel requests yet")
|
||||
}
|
||||
|
||||
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
|
||||
|
||||
for i := 0; i < len(servers); i++ {
|
||||
|
|
Loading…
Reference in a new issue