From d338d70492779268ecbd6264528213fde219b11d Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 13 Mar 2024 11:03:56 -0700 Subject: [PATCH] refactor model parsing --- llm/ggla.go | 62 ++++++---------------------- llm/ggml.go | 100 +++++++++++++++++++++++++++++++++----------- llm/gguf.go | 105 ++++++++--------------------------------------- llm/llm.go | 18 ++++---- server/images.go | 43 ++++++++----------- 5 files changed, 131 insertions(+), 197 deletions(-) diff --git a/llm/ggla.go b/llm/ggla.go index 7b9169e7..591d5e00 100644 --- a/llm/ggla.go +++ b/llm/ggla.go @@ -35,7 +35,7 @@ type ggla struct { *containerGGLA kv KV - tensors []Tensor + tensors []*Tensor } func newGGLA(container *containerGGLA) *ggla { @@ -45,18 +45,26 @@ func newGGLA(container *containerGGLA) *ggla { } } -func (m *ggla) decode(rs io.ReadSeeker) error { +func (llm *ggla) KV() KV { + return llm.kv +} + +func (llm *ggla) Tensors() []*Tensor { + return llm.tensors +} + +func (llm *ggla) decode(rs io.ReadSeeker) error { var r uint32 if err := binary.Read(rs, binary.LittleEndian, &r); err != nil { return err } - m.kv["r"] = r + llm.kv["r"] = r var alpha uint32 if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil { return err } - m.kv["alpha"] = alpha + llm.kv["alpha"] = alpha for { var dims uint32 @@ -115,50 +123,6 @@ func (m *ggla) decode(rs io.ReadSeeker) error { return err } - m.tensors = append(m.tensors, t) + llm.tensors = append(llm.tensors, &t) } } - -func (m *ggla) KV() KV { - return m.kv -} - -func (m *ggla) Tensor() []Tensor { - return m.tensors -} - -func (*ggla) ModelFamily() string { - return "ggla" -} - -func (*ggla) ModelType() string { - panic("not implemented") -} - -func (*ggla) FileType() string { - panic("not implemented") -} - -func (*ggla) NumLayers() uint32 { - panic("not implemented") -} - -func (*ggla) NumGQA() uint32 { - panic("not implemented") -} - -func (*ggla) NumEmbed() uint32 { - panic("not implemented") -} - -func (*ggla) NumHead() uint32 { - panic("not implemented") -} - -func (*ggla) NumHeadKv() uint32 { - panic("not implemented") -} - -func (*ggla) NumCtx() uint32 { - panic("not implemented") -} diff --git a/llm/ggml.go b/llm/ggml.go index 1e81d03b..98a42298 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -3,14 +3,13 @@ package llm import ( "encoding/binary" "errors" + "fmt" "io" ) type GGML struct { container model - - Size int64 } const ( @@ -90,28 +89,82 @@ func fileType(fileType uint32) string { } type model interface { - ModelFamily() string - ModelType() string - FileType() string - NumLayers() uint32 - NumGQA() uint32 - NumEmbed() uint32 - NumHead() uint32 - NumHeadKv() uint32 - NumCtx() uint32 + KV() KV + Tensors() []*Tensor } type KV map[string]any +func (kv KV) u64(key string) uint64 { + switch v := kv[key].(type) { + case uint64: + return v + case uint32: + return uint64(v) + case float64: + return uint64(v) + default: + return 0 + } +} + +func (kv KV) Architecture() string { + if s, ok := kv["general.architecture"].(string); ok { + return s + } + + return "unknown" +} + +func (kv KV) ParameterCount() uint64 { + return kv.u64("general.parameter_count") +} + +func (kv KV) FileType() string { + if u64 := kv.u64("general.file_type"); u64 > 0 { + return fileType(uint32(u64)) + } + + return "unknown" +} + +func (kv KV) BlockCount() uint64 { + return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture())) +} + +func (kv KV) HeadCount() uint64 { + return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture())) +} + +func (kv KV) HeadCountKV() uint64 { + 
return kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())) +} + +func (kv KV) GQA() uint64 { + if headCountKV := kv.HeadCountKV(); headCountKV > 0 { + return kv.HeadCount() / headCountKV + } + + return 0 +} + +func (kv KV) EmbeddingLength() uint64 { + return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture())) +} + +func (kv KV) ContextLength() uint64 { + return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture())) +} + type Tensor struct { - Name string - Kind uint32 - Offset uint64 + Name string `json:"name"` + Kind uint32 `json:"kind"` + Offset uint64 `json:"-"` // Shape is the number of elements in each dimension - Shape []uint64 + Shape []uint64 `json:"shape"` - io.WriterTo + io.WriterTo `json:"-"` } func (t Tensor) blockSize() uint64 { @@ -201,16 +254,16 @@ const ( var ErrUnsupportedFormat = errors.New("unsupported model format") -func DecodeGGML(rs io.ReadSeeker) (*GGML, error) { +func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) { var magic uint32 if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil { - return nil, err + return nil, 0, err } var c container switch magic { case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT: - return nil, ErrUnsupportedFormat + return nil, 0, ErrUnsupportedFormat case FILE_MAGIC_GGLA: c = &containerGGLA{} case FILE_MAGIC_GGUF_LE: @@ -218,25 +271,24 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, error) { case FILE_MAGIC_GGUF_BE: c = &containerGGUF{ByteOrder: binary.BigEndian} default: - return nil, errors.New("invalid file magic") + return nil, 0, errors.New("invalid file magic") } model, err := c.Decode(rs) if errors.Is(err, io.EOF) { // noop } else if err != nil { - return nil, err + return nil, 0, err } offset, err := rs.Seek(0, io.SeekCurrent) if err != nil { - return nil, err + return nil, 0, err } // final model type return &GGML{ container: c, model: model, - Size: offset, - }, nil + }, offset, nil } diff --git a/llm/gguf.go b/llm/gguf.go index 1b270694..7d804712 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -6,8 +6,6 @@ import ( "fmt" "io" "strings" - - "github.com/ollama/ollama/format" ) type containerGGUF struct { @@ -90,8 +88,8 @@ const ( type gguf struct { *containerGGUF - KV - Tensors []Tensor + kv KV + tensors []*Tensor parameters uint64 } @@ -99,7 +97,7 @@ type gguf struct { func newGGUF(container *containerGGUF) *gguf { return &gguf{ containerGGUF: container, - KV: make(KV), + kv: make(KV), } } @@ -107,6 +105,14 @@ func NewGGUFV3(bo binary.ByteOrder) *gguf { return newGGUF(&containerGGUF{ByteOrder: bo, Version: 3}) } +func (llm *gguf) KV() KV { + return llm.kv +} + +func (llm *gguf) Tensors() []*Tensor { + return llm.tensors +} + func (llm *gguf) numTensor() uint64 { switch llm.Version { case 1: @@ -129,30 +135,6 @@ func (llm *gguf) numKV() uint64 { } } -func (llm *gguf) ModelFamily() string { - if t, ok := llm.KV["general.architecture"].(string); ok { - return t - } - - return "unknown" -} - -func (llm *gguf) ModelType() string { - if llm.parameters > 0 { - return format.HumanNumber(llm.parameters) - } - - return "unknown" -} - -func (llm *gguf) FileType() string { - if t, ok := llm.KV["general.file_type"].(uint32); ok { - return fileType(t) - } - - return "unknown" -} - func (llm *gguf) Decode(rs io.ReadSeeker) error { // decode key-values for i := 0; uint64(i) < llm.numKV(); i++ { @@ -202,7 +184,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { return err } - llm.KV[k] = v + llm.kv[k] = v } // decode tensors @@ -243,11 +225,14 @@ func (llm *gguf) Decode(rs io.ReadSeeker) 
error { Shape: shape[:], } - llm.Tensors = append(llm.Tensors, tensor) + llm.tensors = append(llm.tensors, &tensor) llm.parameters += tensor.parameters() } - alignment, ok := llm.KV["general.alignment"].(uint32) + // patch KV with parameter count + llm.kv["general.parameter_count"] = llm.parameters + + alignment, ok := llm.kv["general.alignment"].(uint32) if !ok { alignment = 32 } @@ -262,7 +247,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { return err } - for _, tensor := range llm.Tensors { + for _, tensor := range llm.tensors { padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1) if _, err := rs.Seek(padded, io.SeekCurrent); err != nil { return err @@ -272,60 +257,6 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error { return nil } -func (llm *gguf) NumLayers() uint32 { - value, exists := llm.KV[fmt.Sprintf("%s.block_count", llm.ModelFamily())] - if !exists { - return 0 - } - - return value.(uint32) -} - -func (llm *gguf) NumHead() uint32 { - value, exists := llm.KV[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())] - if !exists { - return 0 - } - - return value.(uint32) -} - -func (llm *gguf) NumEmbed() uint32 { - value, exists := llm.KV[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())] - if !exists { - return 0 - } - - return value.(uint32) -} - -func (llm *gguf) NumHeadKv() uint32 { - value, exists := llm.KV[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())] - if !exists { - return 0 - } - - return value.(uint32) -} - -func (llm *gguf) NumCtx() uint32 { - value, exists := llm.KV[fmt.Sprintf("%s.context_length", llm.ModelFamily())] - if !exists { - return 0 - } - - return value.(uint32) -} - -func (llm *gguf) NumGQA() uint32 { - numHeadKv := llm.NumHeadKv() - if numHeadKv == 0 { - return 0 - } - - return llm.NumHead() / numHeadKv -} - func readGGUF[T any](llm *gguf, r io.Reader) (T, error) { var t T err := binary.Read(r, llm.ByteOrder, &t) diff --git a/llm/llm.go b/llm/llm.go index 75ff955e..044e9842 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -35,14 +35,14 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er } defer f.Close() - ggml, err := DecodeGGML(f) + ggml, size, err := DecodeGGML(f) if err != nil { return nil, err } - if opts.NumCtx > int(ggml.NumCtx()) { - slog.Warn(fmt.Sprintf("requested context length is greater than model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx())) - opts.NumCtx = int(ggml.NumCtx()) + if opts.NumCtx > int(ggml.KV().ContextLength()) { + slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength()) + opts.NumCtx = int(ggml.KV().ContextLength()) } if opts.NumCtx < 4 { @@ -50,18 +50,16 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er } vram, _ := gpu.CheckVRAM() - size := ggml.Size // fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value - kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1)) + kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount()) // this amount is the overhead + tensors in memory // TODO: get this from the llama.cpp's graph calculations instead of // estimating it's 1/6 * kv_cache_size * num_gqa - graph := int64(ggml.NumGQA()) * kv / 6 + graph 
:= int64(ggml.KV().GQA()) * kv / 6 - // certain model architectures don't support gpu inference yet - if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) { + if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) { opts.NumGPU = 0 } @@ -105,7 +103,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er // 2. the proportional kv cache for all devices (kv * % layers) // 3. the proportional model (size * % layers / # devices) // This estimates the number of layers - maxlayers := int64(ggml.NumLayers()) + 1 + maxlayers := int64(ggml.KV().BlockCount()) + 1 devices := int64(info.DeviceCount) avg := vram / devices layers := maxlayers * (avg - graph) / (kv + size/devices) diff --git a/server/images.go b/server/images.go index 1f68ff9c..7410b09c 100644 --- a/server/images.go +++ b/server/images.go @@ -26,6 +26,7 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/convert" + "github.com/ollama/ollama/format" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/parser" "github.com/ollama/ollama/version" @@ -420,37 +421,32 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars defer bin.Close() var offset int64 - CREATE: for { fn(api.ProgressResponse{Status: "creating model layer"}) if _, err := bin.Seek(offset, io.SeekStart); err != nil { return err } - ggml, err := llm.DecodeGGML(bin) - if err != nil { - slog.Error(fmt.Sprintf("error decoding gguf file: %q", err)) - switch { - case errors.Is(err, io.EOF): - break CREATE - case errors.Is(err, llm.ErrUnsupportedFormat): - return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err) - default: - return err - } + ggml, size, err := llm.DecodeGGML(bin) + if errors.Is(err, io.EOF) { + break + } else if errors.Is(err, llm.ErrUnsupportedFormat) { + return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err) + } else if err != nil { + return err } config.SetModelFormat(ggml.Name()) - config.SetModelFamily(ggml.ModelFamily()) - config.SetModelType(ggml.ModelType()) - config.SetFileType(ggml.FileType()) + config.SetModelFamily(ggml.KV().Architecture()) + config.SetModelType(format.HumanNumber(ggml.KV().ParameterCount())) + config.SetFileType(ggml.KV().FileType()) mediatype := mediatype - if ggml.ModelFamily() == "clip" { + if ggml.KV().Architecture() == "clip" { mediatype = "application/vnd.ollama.image.projector" } - sr := io.NewSectionReader(bin, offset, ggml.Size) + sr := io.NewSectionReader(bin, offset, size) layer, err := NewLayer(sr, mediatype) if err != nil { return err @@ -458,7 +454,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars layers.Add(layer) - offset += ggml.Size + offset += size } case "adapter": if strings.HasPrefix(c.Args, "@") { @@ -477,12 +473,12 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars } defer bin.Close() - ggml, err := llm.DecodeGGML(bin) + _, size, err := llm.DecodeGGML(bin) if err != nil { return err } - sr := io.NewSectionReader(bin, 0, ggml.Size) + sr := io.NewSectionReader(bin, 0, size) layer, err := NewLayer(sr, mediatype) if err != nil { return err @@ -554,13 +550,6 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars } } - // xxx - can this be removed? 
- if config.ModelType == "65B" { - if gqa, ok := formattedParams["gqa"].(int); ok && gqa == 8 { - config.ModelType = "70B" - } - } - var b bytes.Buffer if err := json.NewEncoder(&b).Encode(formattedParams); err != nil { return err
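
Illustrative caller-side sketch (not part of the patch): after this refactor, DecodeGGML returns the decoded model plus the number of bytes it consumed (previously ggml.Size), and metadata is read through the KV() accessors instead of the removed ModelFamily/ModelType/Num* methods. The file path and error handling below are assumptions for the example, not code from this change.

	package main

	import (
		"fmt"
		"log"
		"os"

		"github.com/ollama/ollama/llm"
	)

	func main() {
		// Hypothetical model path; any GGUF file on disk would do.
		f, err := os.Open("model.gguf")
		if err != nil {
			log.Fatal(err)
		}
		defer f.Close()

		// DecodeGGML now returns the model and the byte offset it consumed.
		ggml, size, err := llm.DecodeGGML(f)
		if err != nil {
			log.Fatal(err)
		}

		// Metadata comes from the KV() accessors introduced in llm/ggml.go.
		kv := ggml.KV()
		fmt.Println("architecture:", kv.Architecture())
		fmt.Println("parameters:", kv.ParameterCount())
		fmt.Println("context length:", kv.ContextLength())
		fmt.Println("model bytes:", size)
	}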