diff --git a/api/types.go b/api/types.go index 690b1cd1..c5223405 100644 --- a/api/types.go +++ b/api/types.go @@ -109,19 +109,19 @@ type Options struct { // Runner options which must be set when the model is loaded into memory type Runner struct { - UseNUMA bool `json:"numa,omitempty"` - NumCtx int `json:"num_ctx,omitempty"` - NumBatch int `json:"num_batch,omitempty"` - NumGQA int `json:"num_gqa,omitempty"` - NumGPU int `json:"num_gpu,omitempty"` - MainGPU int `json:"main_gpu,omitempty"` - LowVRAM bool `json:"low_vram,omitempty"` - F16KV bool `json:"f16_kv,omitempty"` - LogitsAll bool `json:"logits_all,omitempty"` - VocabOnly bool `json:"vocab_only,omitempty"` - UseMMap bool `json:"use_mmap,omitempty"` - UseMLock bool `json:"use_mlock,omitempty"` - NumThread int `json:"num_thread,omitempty"` + UseNUMA bool `json:"numa,omitempty"` + NumCtx int `json:"num_ctx,omitempty"` + NumBatch int `json:"num_batch,omitempty"` + NumGQA int `json:"num_gqa,omitempty"` + NumGPU int `json:"num_gpu,omitempty"` + MainGPU int `json:"main_gpu,omitempty"` + LowVRAM bool `json:"low_vram,omitempty"` + F16KV bool `json:"f16_kv,omitempty"` + LogitsAll bool `json:"logits_all,omitempty"` + VocabOnly bool `json:"vocab_only,omitempty"` + UseMMap bool `json:"use_mmap,omitempty"` + UseMLock bool `json:"use_mlock,omitempty"` + NumThread int `json:"num_thread,omitempty"` } type EmbeddingRequest struct { @@ -137,10 +137,11 @@ type EmbeddingResponse struct { } type CreateRequest struct { - Model string `json:"model"` - Path string `json:"path"` - Modelfile string `json:"modelfile"` - Stream *bool `json:"stream,omitempty"` + Model string `json:"model"` + Path string `json:"path"` + Modelfile string `json:"modelfile"` + Stream *bool `json:"stream,omitempty"` + Quantization string `json:"quantization,omitempty"` // Name is deprecated, see Model Name string `json:"name"` @@ -380,16 +381,16 @@ func DefaultOptions() Options { Runner: Runner{ // options set when the model is loaded - NumCtx: 2048, - 
NumBatch: 512, - NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically - NumGQA: 1, - NumThread: 0, // let the runtime decide - LowVRAM: false, - F16KV: true, - UseMLock: false, - UseMMap: true, - UseNUMA: false, + NumCtx: 2048, + NumBatch: 512, + NumGPU: -1, // -1 here indicates that NumGPU should be set dynamically + NumGQA: 1, + NumThread: 0, // let the runtime decide + LowVRAM: false, + F16KV: true, + UseMLock: false, + UseMMap: true, + UseNUMA: false, }, } } diff --git a/cmd/cmd.go b/cmd/cmd.go index cb8a162e..9f6c928c 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -194,7 +194,9 @@ func CreateHandler(cmd *cobra.Command, args []string) error { return nil } - request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile)} + quantization, _ := cmd.Flags().GetString("quantization") + + request := api.CreateRequest{Name: args[0], Modelfile: string(modelfile), Quantization: quantization} if err := client.Create(cmd.Context(), &request, fn); err != nil { return err } @@ -943,6 +945,7 @@ func NewCLI() *cobra.Command { } createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile (default \"Modelfile\")") + createCmd.Flags().StringP("quantization", "q", "", "Quantization level.") showCmd := &cobra.Command{ Use: "show MODEL", diff --git a/llm/llm.go b/llm/llm.go index 52c53ad2..33949c76 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -6,10 +6,81 @@ package llm // #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++ // #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++ // #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++ +// #include <stdlib.h> // #include "llama.h" import "C" +import ( + "fmt" + "unsafe" +) // SystemInfo is an unused example of calling llama.cpp functions using CGo func SystemInfo() string { return C.GoString(C.llama_print_system_info()) } + +func Quantize(infile, outfile, filetype string) error { + cinfile := C.CString(infile) + 
defer C.free(unsafe.Pointer(cinfile)) + + coutfile := C.CString(outfile) + defer C.free(unsafe.Pointer(coutfile)) + + params := C.llama_model_quantize_default_params() + params.nthread = -1 + + switch filetype { + case "F32": + params.ftype = fileTypeF32 + case "F16": + params.ftype = fileTypeF16 + case "Q4_0": + params.ftype = fileTypeQ4_0 + case "Q4_1": + params.ftype = fileTypeQ4_1 + case "Q4_1_F16": + params.ftype = fileTypeQ4_1_F16 + case "Q8_0": + params.ftype = fileTypeQ8_0 + case "Q5_0": + params.ftype = fileTypeQ5_0 + case "Q5_1": + params.ftype = fileTypeQ5_1 + case "Q2_K": + params.ftype = fileTypeQ2_K + case "Q3_K_S": + params.ftype = fileTypeQ3_K_S + case "Q3_K_M": + params.ftype = fileTypeQ3_K_M + case "Q3_K_L": + params.ftype = fileTypeQ3_K_L + case "Q4_K_S": + params.ftype = fileTypeQ4_K_S + case "Q4_K_M": + params.ftype = fileTypeQ4_K_M + case "Q5_K_S": + params.ftype = fileTypeQ5_K_S + case "Q5_K_M": + params.ftype = fileTypeQ5_K_M + case "Q6_K": + params.ftype = fileTypeQ6_K + case "IQ2_XXS": + params.ftype = fileTypeIQ2_XXS + case "IQ2_XS": + params.ftype = fileTypeIQ2_XS + case "Q2_K_S": + params.ftype = fileTypeQ2_K_S + case "Q3_K_XS": + params.ftype = fileTypeQ3_K_XS + case "IQ3_XXS": + params.ftype = fileTypeIQ3_XXS + default: + return fmt.Errorf("unknown filetype: %s", filetype) + } + + if retval := C.llama_model_quantize(cinfile, coutfile, &params); retval != 0 { + return fmt.Errorf("llama_model_quantize: %d", retval) + } + + return nil +} diff --git a/server/images.go b/server/images.go index d729db55..0da51b85 100644 --- a/server/images.go +++ b/server/images.go @@ -284,7 +284,7 @@ func realpath(mfDir, from string) string { return abspath } -func CreateModel(ctx context.Context, name, modelFileDir string, commands []parser.Command, fn func(resp api.ProgressResponse)) error { +func CreateModel(ctx context.Context, name, modelFileDir, quantization string, commands []parser.Command, fn func(resp api.ProgressResponse)) error { deleteMap := 
make(map[string]struct{}) if manifest, _, err := GetManifest(ParseModelPath(name)); err == nil { for _, layer := range append(manifest.Layers, manifest.Config) { @@ -337,8 +337,27 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars if ggufName != "" { pathName = ggufName - slog.Debug(fmt.Sprintf("new image layer path: %s", pathName)) defer os.RemoveAll(ggufName) + + if quantization != "" { + quantization = strings.ToUpper(quantization) + fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", "F16", quantization)}) + tempfile, err := os.CreateTemp(filepath.Dir(ggufName), quantization) + if err != nil { + return err + } + defer os.RemoveAll(tempfile.Name()) + + if err := llm.Quantize(ggufName, tempfile.Name(), quantization); err != nil { + return err + } + + if err := tempfile.Close(); err != nil { + return err + } + + pathName = tempfile.Name() + } } bin, err := os.Open(pathName) diff --git a/server/routes.go b/server/routes.go index 8beee349..d1e7f4cd 100644 --- a/server/routes.go +++ b/server/routes.go @@ -647,7 +647,7 @@ func CreateModelHandler(c *gin.Context) { ctx, cancel := context.WithCancel(c.Request.Context()) defer cancel() - if err := CreateModel(ctx, model, filepath.Dir(req.Path), commands, fn); err != nil { + if err := CreateModel(ctx, model, filepath.Dir(req.Path), req.Quantization, commands, fn); err != nil { ch <- gin.H{"error": err.Error()} } }() diff --git a/server/routes_test.go b/server/routes_test.go index 7a9afe1e..4f907702 100644 --- a/server/routes_test.go +++ b/server/routes_test.go @@ -61,7 +61,7 @@ func Test_Routes(t *testing.T) { fn := func(resp api.ProgressResponse) { t.Logf("Status: %s", resp.Status) } - err = CreateModel(context.TODO(), name, "", commands, fn) + err = CreateModel(context.TODO(), name, "", "", commands, fn) assert.Nil(t, err) }