restore model load duration on generate response (#1524)

* restore model load duration on generate response

- set model load duration on generate and chat done response
- calculate createdAt time when the response is created

* remove checkpoints predict opts

* Update routes.go
This commit is contained in:
Bruce MacDonald 2023-12-14 12:15:50 -05:00 committed by GitHub
parent 31f0551dab
commit 6ee8c80199
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 36 deletions

View file

@ -551,14 +551,9 @@ type PredictOpts struct {
Prompt string Prompt string
Format string Format string
Images []api.ImageData Images []api.ImageData
CheckpointStart time.Time
CheckpointLoaded time.Time
} }
type PredictResult struct { type PredictResult struct {
CreatedAt time.Time
TotalDuration time.Duration
LoadDuration time.Duration
Content string Content string
Done bool Done bool
PromptEvalCount int PromptEvalCount int
@ -681,16 +676,12 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
if p.Content != "" { if p.Content != "" {
fn(PredictResult{ fn(PredictResult{
CreatedAt: time.Now().UTC(),
Content: p.Content, Content: p.Content,
}) })
} }
if p.Stop { if p.Stop {
fn(PredictResult{ fn(PredictResult{
CreatedAt: time.Now().UTC(),
TotalDuration: time.Since(predict.CheckpointStart),
Done: true, Done: true,
PromptEvalCount: p.Timings.PromptN, PromptEvalCount: p.Timings.PromptN,
PromptEvalDuration: parseDurationMs(p.Timings.PromptMS), PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),

View file

@ -261,12 +261,10 @@ func GenerateHandler(c *gin.Context) {
resp := api.GenerateResponse{ resp := api.GenerateResponse{
Model: req.Model, Model: req.Model,
CreatedAt: r.CreatedAt, CreatedAt: time.Now().UTC(),
Done: r.Done, Done: r.Done,
Response: r.Content, Response: r.Content,
Metrics: api.Metrics{ Metrics: api.Metrics{
TotalDuration: r.TotalDuration,
LoadDuration: r.LoadDuration,
PromptEvalCount: r.PromptEvalCount, PromptEvalCount: r.PromptEvalCount,
PromptEvalDuration: r.PromptEvalDuration, PromptEvalDuration: r.PromptEvalDuration,
EvalCount: r.EvalCount, EvalCount: r.EvalCount,
@ -274,7 +272,11 @@ func GenerateHandler(c *gin.Context) {
}, },
} }
if r.Done && !req.Raw { if r.Done {
resp.TotalDuration = time.Since(checkpointStart)
resp.LoadDuration = checkpointLoaded.Sub(checkpointStart)
if !req.Raw {
embd, err := loaded.runner.Encode(c.Request.Context(), prompt+generated.String()) embd, err := loaded.runner.Encode(c.Request.Context(), prompt+generated.String())
if err != nil { if err != nil {
ch <- gin.H{"error": err.Error()} ch <- gin.H{"error": err.Error()}
@ -282,6 +284,7 @@ func GenerateHandler(c *gin.Context) {
} }
resp.Context = embd resp.Context = embd
} }
}
ch <- resp ch <- resp
} }
@ -290,8 +293,6 @@ func GenerateHandler(c *gin.Context) {
predictReq := llm.PredictOpts{ predictReq := llm.PredictOpts{
Prompt: prompt, Prompt: prompt,
Format: req.Format, Format: req.Format,
CheckpointStart: checkpointStart,
CheckpointLoaded: checkpointLoaded,
Images: req.Images, Images: req.Images,
} }
if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil { if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
@ -1012,11 +1013,9 @@ func ChatHandler(c *gin.Context) {
resp := api.ChatResponse{ resp := api.ChatResponse{
Model: req.Model, Model: req.Model,
CreatedAt: r.CreatedAt, CreatedAt: time.Now().UTC(),
Done: r.Done, Done: r.Done,
Metrics: api.Metrics{ Metrics: api.Metrics{
TotalDuration: r.TotalDuration,
LoadDuration: r.LoadDuration,
PromptEvalCount: r.PromptEvalCount, PromptEvalCount: r.PromptEvalCount,
PromptEvalDuration: r.PromptEvalDuration, PromptEvalDuration: r.PromptEvalDuration,
EvalCount: r.EvalCount, EvalCount: r.EvalCount,
@ -1024,7 +1023,10 @@ func ChatHandler(c *gin.Context) {
}, },
} }
if !r.Done { if r.Done {
resp.TotalDuration = time.Since(checkpointStart)
resp.LoadDuration = checkpointLoaded.Sub(checkpointStart)
} else {
resp.Message = &api.Message{Role: "assistant", Content: r.Content} resp.Message = &api.Message{Role: "assistant", Content: r.Content}
} }
@ -1035,8 +1037,6 @@ func ChatHandler(c *gin.Context) {
predictReq := llm.PredictOpts{ predictReq := llm.PredictOpts{
Prompt: prompt, Prompt: prompt,
Format: req.Format, Format: req.Format,
CheckpointStart: checkpointStart,
CheckpointLoaded: checkpointLoaded,
Images: images, Images: images,
} }
if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil { if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {