diff --git a/api/types.go b/api/types.go
index fcc27585..ffa5b7ca 100644
--- a/api/types.go
+++ b/api/types.go
@@ -38,6 +38,7 @@ type GenerateRequest struct {
 	Context  []int  `json:"context,omitempty"`
 	Stream   *bool  `json:"stream,omitempty"`
 	Raw      bool   `json:"raw,omitempty"`
+	Format   string `json:"format"`
 
 	Options map[string]interface{} `json:"options"`
 }
diff --git a/docs/api.md b/docs/api.md
index 99c36bf4..e214ad7a 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -38,6 +38,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
 
 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
+- `format`: the format to return a response in. Currently the only accepted value is `json`
 
 Advanced parameters (optional):
 
@@ -48,13 +49,17 @@ Advanced parameters (optional):
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `raw`: if `true` no formatting will be applied to the prompt and no context will be returned. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API, and are managing history yourself.
 
+### JSON mode
+
+Enable JSON mode by setting the `format` parameter to `json` and specifying that the model should use JSON in the `prompt`. This will structure the response as valid JSON. See the JSON mode [example](#request-json-mode) below.
+
 ### Examples
 
 #### Request
 
 ```shell
 curl -X POST http://localhost:11434/api/generate -d '{
-  "model": "llama2:7b",
+  "model": "llama2",
   "prompt": "Why is the sky blue?"
 }'
 ```
@@ -65,7 +70,7 @@ A stream of JSON objects is returned:
 
 ```json
 {
-  "model": "llama2:7b",
+  "model": "llama2",
   "created_at": "2023-08-04T08:52:19.385406455-07:00",
   "response": "The",
   "done": false
@@ -89,7 +94,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 
 ```json
 {
-  "model": "llama2:7b",
+  "model": "llama2",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "response": "",
   "context": [1, 2, 3],
@@ -105,7 +110,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 }
 ```
 
-#### Request
+#### Request (No streaming)
 
 ```shell
 curl -X POST http://localhost:11434/api/generate -d '{
@@ -137,7 +142,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
 }
 ```
 
-#### Request
+#### Request (Raw mode)
 
 In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting and context.
 
@@ -167,7 +172,54 @@ curl -X POST http://localhost:11434/api/generate -d '{
 }
 ```
 
-#### Request
+#### Request (JSON mode)
+
+```shell
+curl -X POST http://localhost:11434/api/generate -d '{
+  "model": "llama2",
+  "prompt": "What color is the sky at different times of the day? Respond using JSON",
Respond using JSON", + "format": "json", + "stream": false +}' +``` + +#### Response + +```json +{ + "model": "llama2", + "created_at": "2023-11-09T21:07:55.186497Z", + "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n", + "done": true, + "total_duration": 4661289125, + "load_duration": 1714434500, + "prompt_eval_count": 36, + "prompt_eval_duration": 264132000, + "eval_count": 75, + "eval_duration": 2112149000 +} +``` + +The value of `response` will be a string containing JSON similar to: + +```json +{ + "morning": { + "color": "blue" + }, + "noon": { + "color": "blue-gray" + }, + "afternoon": { + "color": "warm gray" + }, + "evening": { + "color": "orange" + } +} +``` + +#### Request (With options) If you want to set custom options for the model at runtime rather than in the Modelfile, you can do so with the `options` parameter. This example sets every available option, but you can set any of them individually and omit the ones you do not want to override. diff --git a/llm/llama.go b/llm/llama.go index 903c5f74..d8859393 100644 --- a/llm/llama.go +++ b/llm/llama.go @@ -27,6 +27,34 @@ import ( "github.com/jmorganca/ollama/format" ) +const jsonGrammar = ` +root ::= object +value ::= object | array | string | number | ("true" | "false" | "null") ws + +object ::= + "{" ws ( + string ":" ws value + ("," ws string ":" ws value)* + )? "}" ws + +array ::= + "[" ws ( + value + ("," ws value)* + )? "]" ws + +string ::= + "\"" ( + [^"\\] | + "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes + )* "\"" ws + +number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws + +# Optional space: by convention, applied in this grammar after literal chars when allowed +ws ::= ([ \t\n] ws)? +` + //go:embed llama.cpp/*/build/*/bin/* var llamaCppEmbed embed.FS @@ -497,7 +525,7 @@ type prediction struct { const maxBufferSize = 512 * format.KiloByte -func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error { +func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, format string, fn func(api.GenerateResponse)) error { prevConvo, err := llm.Decode(ctx, prevContext) if err != nil { return err @@ -532,6 +560,10 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, "stop": llm.Stop, } + if format == "json" { + request["grammar"] = jsonGrammar + } + // Handling JSON marshaling with special characters unescaped. 
 	buffer := &bytes.Buffer{}
 	enc := json.NewEncoder(buffer)
diff --git a/llm/llm.go b/llm/llm.go
index 34017dad..22706da5 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -14,7 +14,7 @@ import (
 )
 
 type LLM interface {
-	Predict(context.Context, []int, string, func(api.GenerateResponse)) error
+	Predict(context.Context, []int, string, string, func(api.GenerateResponse)) error
 	Embedding(context.Context, string) ([]float64, error)
 	Encode(context.Context, string) ([]int, error)
 	Decode(context.Context, []int) (string, error)
diff --git a/server/routes.go b/server/routes.go
index 9884afbf..a543b10e 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -163,6 +163,9 @@ func GenerateHandler(c *gin.Context) {
 	case req.Model == "":
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
 		return
+	case len(req.Format) > 0 && req.Format != "json":
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be json"})
+		return
 	case req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0):
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "raw mode does not support template, system, or context"})
 		return
@@ -231,7 +234,7 @@ func GenerateHandler(c *gin.Context) {
 			ch <- r
 		}
 
-		if err := loaded.runner.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
+		if err := loaded.runner.Predict(c.Request.Context(), req.Context, prompt, req.Format, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
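
As an end-to-end illustration of the new parameter, the sketch below drives the changed `/api/generate` endpoint from a small Go client. This is a minimal sketch rather than part of the patch: it assumes an Ollama server on the default `localhost:11434` with the `llama2` model from the examples above available, and the trimmed request/response structs are illustrative stand-ins for the full `api.GenerateRequest` and `api.GenerateResponse` types.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// generateRequest mirrors only the request fields this example needs.
type generateRequest struct {
	Model  string `json:"model"`
	Prompt string `json:"prompt"`
	Format string `json:"format,omitempty"`
	Stream bool   `json:"stream"`
}

// generateResponse captures just the "response" field of the reply.
type generateResponse struct {
	Response string `json:"response"`
}

func main() {
	body, err := json.Marshal(generateRequest{
		Model:  "llama2",
		Prompt: "What color is the sky at different times of the day? Respond using JSON",
		Format: "json", // new parameter: constrain generation to valid JSON
		Stream: false,  // single response object instead of a stream
	})
	if err != nil {
		panic(err)
	}

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out generateResponse
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}

	// With "format": "json", the response string is itself a JSON document.
	var parsed map[string]interface{}
	if err := json.Unmarshal([]byte(out.Response), &parsed); err != nil {
		panic(err)
	}
	fmt.Println(parsed)
}
```

Note that the grammar only constrains the shape of the output; as the documentation change above says, the prompt itself still needs to tell the model to respond in JSON.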