diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go
index e4bc872c..d66ba9f0 100644
--- a/integration/concurrency_test.go
+++ b/integration/concurrency_test.go
@@ -19,17 +19,19 @@ func TestMultiModelConcurrency(t *testing.T) {
 	var (
 		req = [2]api.GenerateRequest{
 			{
-				Model:  "orca-mini",
-				Prompt: "why is the ocean blue?",
-				Stream: &stream,
+				Model:     "orca-mini",
+				Prompt:    "why is the ocean blue?",
+				Stream:    &stream,
+				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
 				},
 			}, {
-				Model:  "tinydolphin",
-				Prompt: "what is the origin of the us thanksgiving holiday?",
-				Stream: &stream,
+				Model:     "tinydolphin",
+				Prompt:    "what is the origin of the us thanksgiving holiday?",
+				Stream:    &stream,
+				KeepAlive: &api.Duration{Duration: 10 * time.Second},
 				Options: map[string]interface{}{
 					"seed":        42,
 					"temperature": 0.0,
@@ -43,7 +45,7 @@ func TestMultiModelConcurrency(t *testing.T) {
 	)
 	var wg sync.WaitGroup
 	wg.Add(len(req))
-	ctx, cancel := context.WithTimeout(context.Background(), time.Second*120)
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
 	defer cancel()
 
 	client, _, cleanup := InitServerConnection(ctx, t)
@@ -56,32 +58,46 @@ func TestMultiModelConcurrency(t *testing.T) {
 	for i := 0; i < len(req); i++ {
 		go func(i int) {
 			defer wg.Done()
-			DoGenerate(ctx, t, client, req[i], resp[i], 30*time.Second, 10*time.Second)
+			DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
 		}(i)
 	}
 	wg.Wait()
 }
 
 func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) // GTX 750 2G card takes ~9 minutes
+	req, resp := GenerateRequests()
+	reqLimit := len(req)
+	iterLimit := 5
+
+	vram := os.Getenv("OLLAMA_MAX_VRAM")
+	if vram != "" {
+		max, err := strconv.ParseUint(vram, 10, 64)
+		require.NoError(t, err)
+		// Don't hammer on small VRAM cards...
+		if max < 4*1024*1024*1024 {
+			reqLimit = min(reqLimit, 2)
+			iterLimit = 2
+		}
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
 	defer cancel()
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()
 
-	req, resp := GenerateRequests()
 	// Get the server running (if applicable) warm the model up with a single initial request
-	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 5*time.Second)
+	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
 
 	var wg sync.WaitGroup
-	wg.Add(len(req))
-	for i := 0; i < len(req); i++ {
+	wg.Add(reqLimit)
+	for i := 0; i < reqLimit; i++ {
 		go func(i int) {
 			defer wg.Done()
-			for j := 0; j < 5; j++ {
+			for j := 0; j < iterLimit; j++ {
 				slog.Info("Starting", "req", i, "iter", j)
-				// On slower GPUs it can take a while to process the 4 concurrent requests
+				// On slower GPUs it can take a while to process the concurrent requests
 				// so we allow a much longer initial timeout
-				DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 5*time.Second)
+				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
 			}
 		}(i)
 	}
diff --git a/integration/context_test.go b/integration/context_test.go
index 75efb435..025b803d 100644
--- a/integration/context_test.go
+++ b/integration/context_test.go
@@ -11,7 +11,7 @@ import (
 )
 
 func TestContextExhaustion(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute) // Longer needed for small footprint GPUs
+	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute) // Longer needed for small footprint GPUs
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{
diff --git a/integration/llm_image_test.go b/integration/llm_image_test.go
index 77319aef..d0c861cc 100644
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@@ -32,7 +32,11 @@ func TestIntegrationMultimodal(t *testing.T) {
 	resp := "the ollam"
 	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Minute)
 	defer cancel()
-	GenerateTestHelper(ctx, t, req, []string{resp})
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	require.NoError(t, PullIfMissing(ctx, client, req.Model))
+	// llava models on CPU can be quite slow to start,
+	DoGenerate(ctx, t, client, req, []string{resp}, 120*time.Second, 30*time.Second)
 }
 
 const imageEncoding = `iVBORw0KGgoAAAANSUhEUgAAANIAAAB4CAYAAACHHqzKAAAAAXNSR0IArs4c6QAAAIRlWElmTU0AKgAAAAgABQESAAMAAAABAAEAAAEaAAUAAAABAAAASgEb
diff --git a/integration/utils_test.go b/integration/utils_test.go
index 5da6fc72..7e1fcc10 100644
--- a/integration/utils_test.go
+++ b/integration/utils_test.go
@@ -140,7 +140,7 @@ func PullIfMissing(ctx context.Context, client *api.Client, modelName string) er
 
 	showCtx, cancel := context.WithDeadlineCause(
 		ctx,
-		time.Now().Add(5*time.Second),
+		time.Now().Add(10*time.Second),
 		fmt.Errorf("show for existing model %s took too long", modelName),
 	)
 	defer cancel()
@@ -287,41 +287,46 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
 func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 	return []api.GenerateRequest{
 		{
-			Model:  "orca-mini",
-			Prompt: "why is the ocean blue?",
-			Stream: &stream,
+			Model:     "orca-mini",
+			Prompt:    "why is the ocean blue?",
+			Stream:    &stream,
+			KeepAlive: &api.Duration{Duration: 10 * time.Second},
 			Options: map[string]interface{}{
 				"seed":        42,
 				"temperature": 0.0,
 			},
 		}, {
-			Model:  "orca-mini",
-			Prompt: "why is the color of dirt brown?",
-			Stream: &stream,
"orca-mini", + Prompt: "why is the color of dirt brown?", + Stream: &stream, + KeepAlive: &api.Duration{Duration: 10 * time.Second}, Options: map[string]interface{}{ "seed": 42, "temperature": 0.0, }, }, { - Model: "orca-mini", - Prompt: "what is the origin of the us thanksgiving holiday?", - Stream: &stream, + Model: "orca-mini", + Prompt: "what is the origin of the us thanksgiving holiday?", + Stream: &stream, + KeepAlive: &api.Duration{Duration: 10 * time.Second}, Options: map[string]interface{}{ "seed": 42, "temperature": 0.0, }, }, { - Model: "orca-mini", - Prompt: "what is the origin of independence day?", - Stream: &stream, + Model: "orca-mini", + Prompt: "what is the origin of independence day?", + Stream: &stream, + KeepAlive: &api.Duration{Duration: 10 * time.Second}, Options: map[string]interface{}{ "seed": 42, "temperature": 0.0, }, }, { - Model: "orca-mini", - Prompt: "what is the composition of air?", - Stream: &stream, + Model: "orca-mini", + Prompt: "what is the composition of air?", + Stream: &stream, + KeepAlive: &api.Duration{Duration: 10 * time.Second}, Options: map[string]interface{}{ "seed": 42, "temperature": 0.0,