Fix llava models not working after first request (#4164)
* fix llava models not working after first request
* individual requests only for llava models
This commit is contained in:
parent dfa2f32ca0
commit 1b0e6c9c0e
llm/patches/05-clip-fix.diff (new file, 24 lines)
@@ -0,0 +1,24 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e3c9bcd4..b43f892d 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    struct ggml_tensor * embeddings = inp;
    if (ctx->has_class_embedding) {
        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+   }
+   ggml_set_name(embeddings, "embeddings");
+   ggml_set_input(embeddings);
+
+   if (ctx->has_class_embedding) {
        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
        embeddings = ggml_acc(ctx0, embeddings, inp,
                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
    }
-   ggml_set_name(embeddings, "embeddings");
-   ggml_set_input(embeddings);
-

    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
    ggml_set_name(positions, "positions");
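Why this reordering fixes the first-request-only behavior (a reading of the patch, not text from the commit): previously, ggml_set_name and ggml_set_input ran after the two ggml_acc calls, so the name "embeddings" and the input flag landed on the last ggml_acc result, an op node, rather than on the tensor allocated by ggml_new_tensor_3d. clip.cpp fills that tensor by looking it up by name before each evaluation, roughly as in the sketch below (assuming ggml's public API; the exact call site in clip.cpp may differ):

#include <vector>
#include <cstdint>

#include "ggml.h"
#include "ggml-backend.h"

// Hedged sketch of the consumer side: before evaluating the reused graph,
// look up the input tensor by name and zero it so the ggml_acc ops can
// accumulate the class embedding and the patch embeddings into it.
static void zero_embeddings_input(struct ggml_cgraph * gf) {
    struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
    if (embeddings != NULL) {
        std::vector<uint8_t> zeros(ggml_nbytes(embeddings), 0);
        ggml_backend_tensor_set(embeddings, zeros.data(), 0, ggml_nbytes(embeddings));
    }
}

With the old ordering, this by-name lookup and the input flag applied to a computed node instead of the real input, which is plausibly why the first request worked but later evaluations of the reused graph saw stale data. Tagging the freshly allocated tensor before the ggml_acc ops keeps both the lookup and the allocator's input handling pointed at the actual input buffer.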
llm/server.go
@@ -194,8 +194,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
        params = append(params, "--numa")
    }
    // "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
    numParallel := envconfig.NumParallel
+
+   // TODO (jmorganca): multimodal models don't support parallel yet
+   // see https://github.com/ollama/ollama/issues/4165
+   if len(projectors) > 0 {
+       numParallel = 1
+       slog.Warn("multimodal models don't support parallel requests yet")
+   }

    params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))

    for i := 0; i < len(servers); i++ {
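Note on this hunk (an observation about the change, not commit text): whenever projector files are present, i.e. the model is multimodal like llava, the server is forced down to --parallel 1, matching the "individual requests only for llava models" bullet above; text-only models keep the configured envconfig.NumParallel.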