parent
5314fc9b63
commit
c0960e29b5
31
llm/llama.go
31
llm/llama.go
|
@ -412,10 +412,6 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
|
||||||
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
|
port := rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
|
||||||
params := append(params, "--port", strconv.Itoa(port))
|
params := append(params, "--port", strconv.Itoa(port))
|
||||||
|
|
||||||
if runner.Type == "gguf" {
|
|
||||||
params = append(params, "--parallel", "2")
|
|
||||||
}
|
|
||||||
|
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
cmd := exec.CommandContext(
|
cmd := exec.CommandContext(
|
||||||
ctx,
|
ctx,
|
||||||
|
@ -549,6 +545,8 @@ type prediction struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
const maxBufferSize = 512 * format.KiloByte
|
const maxBufferSize = 512 * format.KiloByte
|
||||||
|
const maxRetries = 3
|
||||||
|
const retryDelay = 1 * time.Second
|
||||||
|
|
||||||
type PredictOpts struct {
|
type PredictOpts struct {
|
||||||
Prompt string
|
Prompt string
|
||||||
|
@ -570,6 +568,11 @@ type PredictResult struct {
|
||||||
EvalDuration time.Duration
|
EvalDuration time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isRetryable reports whether the line matches a condition that can be retried
|
||||||
|
func isRetryable(line []byte) bool {
|
||||||
|
return bytes.Contains(line, []byte("slot unavailable"))
|
||||||
|
}
|
||||||
|
|
||||||
func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
|
func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
|
||||||
imageData := llm.ImageData
|
imageData := llm.ImageData
|
||||||
if len(predict.Images) > 0 {
|
if len(predict.Images) > 0 {
|
||||||
|
@ -607,6 +610,11 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
|
||||||
request["grammar"] = jsonGrammar
|
request["grammar"] = jsonGrammar
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for retries := 0; retries < maxRetries; retries++ {
|
||||||
|
if retries > 0 {
|
||||||
|
time.Sleep(retryDelay) // wait before retrying
|
||||||
|
}
|
||||||
|
|
||||||
// Handling JSON marshaling with special characters unescaped.
|
// Handling JSON marshaling with special characters unescaped.
|
||||||
buffer := &bytes.Buffer{}
|
buffer := &bytes.Buffer{}
|
||||||
enc := json.NewEncoder(buffer)
|
enc := json.NewEncoder(buffer)
|
||||||
|
@ -642,6 +650,8 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
|
||||||
// increase the buffer size to avoid running out of space
|
// increase the buffer size to avoid running out of space
|
||||||
buf := make([]byte, 0, maxBufferSize)
|
buf := make([]byte, 0, maxBufferSize)
|
||||||
scanner.Buffer(buf, maxBufferSize)
|
scanner.Buffer(buf, maxBufferSize)
|
||||||
|
|
||||||
|
retryNeeded := false
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
|
@ -653,6 +663,11 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if isRetryable(line) {
|
||||||
|
retryNeeded = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
evt, ok := bytes.CutPrefix(line, []byte("data: "))
|
evt, ok := bytes.CutPrefix(line, []byte("data: "))
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("error parsing llm response stream: %s", line)
|
return fmt.Errorf("error parsing llm response stream: %s", line)
|
||||||
|
@ -698,7 +713,13 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
|
||||||
return fmt.Errorf("error reading llm response: %v", err)
|
return fmt.Errorf("error reading llm response: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
if !retryNeeded {
|
||||||
|
return nil // success
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// reached only when the retry loop above ran out of attempts (maxRetries)
|
||||||
|
return fmt.Errorf("max retries exceeded")
|
||||||
}
|
}
|
||||||
|
|
||||||
type TokenizeRequest struct {
|
type TokenizeRequest struct {
|
||||||
|
|
Loading…
Reference in a new issue