refactor model parsing

Michael Yang 2024-03-13 11:03:56 -07:00
parent 011bb67351
commit d338d70492
5 changed files with 131 additions and 197 deletions

@@ -35,7 +35,7 @@ type ggla struct {
 	*containerGGLA

 	kv      KV
-	tensors []Tensor
+	tensors []*Tensor
 }

 func newGGLA(container *containerGGLA) *ggla {
@@ -45,18 +45,26 @@ func newGGLA(container *containerGGLA) *ggla {
 	}
 }

-func (m *ggla) decode(rs io.ReadSeeker) error {
+func (llm *ggla) KV() KV {
+	return llm.kv
+}
+
+func (llm *ggla) Tensors() []*Tensor {
+	return llm.tensors
+}
+
+func (llm *ggla) decode(rs io.ReadSeeker) error {
 	var r uint32
 	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
 		return err
 	}
-	m.kv["r"] = r
+	llm.kv["r"] = r

 	var alpha uint32
 	if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
 		return err
 	}
-	m.kv["alpha"] = alpha
+	llm.kv["alpha"] = alpha

 	for {
 		var dims uint32
@@ -115,50 +123,6 @@ func (m *ggla) decode(rs io.ReadSeeker) error {
 			return err
 		}

-		m.tensors = append(m.tensors, t)
+		llm.tensors = append(llm.tensors, &t)
 	}
 }
-
-func (m *ggla) KV() KV {
-	return m.kv
-}
-
-func (m *ggla) Tensor() []Tensor {
-	return m.tensors
-}
-
-func (*ggla) ModelFamily() string {
-	return "ggla"
-}
-
-func (*ggla) ModelType() string {
-	panic("not implemented")
-}
-
-func (*ggla) FileType() string {
-	panic("not implemented")
-}
-
-func (*ggla) NumLayers() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumGQA() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumEmbed() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumHead() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumHeadKv() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumCtx() uint32 {
-	panic("not implemented")
-}

@@ -3,14 +3,13 @@ package llm
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 )

 type GGML struct {
 	container
 	model
-
-	Size int64
 }

 const (
@@ -90,28 +89,82 @@ func fileType(fileType uint32) string {
 }

 type model interface {
-	ModelFamily() string
-	ModelType() string
-	FileType() string
-	NumLayers() uint32
-	NumGQA() uint32
-	NumEmbed() uint32
-	NumHead() uint32
-	NumHeadKv() uint32
-	NumCtx() uint32
+	KV() KV
+	Tensors() []*Tensor
 }

 type KV map[string]any

+func (kv KV) u64(key string) uint64 {
+	switch v := kv[key].(type) {
+	case uint64:
+		return v
+	case uint32:
+		return uint64(v)
+	case float64:
+		return uint64(v)
+	default:
+		return 0
+	}
+}
+
+func (kv KV) Architecture() string {
+	if s, ok := kv["general.architecture"].(string); ok {
+		return s
+	}
+
+	return "unknown"
+}
+
+func (kv KV) ParameterCount() uint64 {
+	return kv.u64("general.parameter_count")
+}
+
+func (kv KV) FileType() string {
+	if u64 := kv.u64("general.file_type"); u64 > 0 {
+		return fileType(uint32(u64))
+	}
+
+	return "unknown"
+}
+
+func (kv KV) BlockCount() uint64 {
+	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
+}
+
+func (kv KV) HeadCount() uint64 {
+	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
+}
+
+func (kv KV) HeadCountKV() uint64 {
+	return kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture()))
+}
+
+func (kv KV) GQA() uint64 {
+	if headCountKV := kv.HeadCountKV(); headCountKV > 0 {
+		return kv.HeadCount() / headCountKV
+	}
+
+	return 0
+}
+
+func (kv KV) EmbeddingLength() uint64 {
+	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
+}
+
+func (kv KV) ContextLength() uint64 {
+	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
+}
+
 type Tensor struct {
-	Name   string
-	Kind   uint32
-	Offset uint64
+	Name   string `json:"name"`
+	Kind   uint32 `json:"kind"`
+	Offset uint64 `json:"-"`

 	// Shape is the number of elements in each dimension
-	Shape []uint64
+	Shape []uint64 `json:"shape"`

-	io.WriterTo
+	io.WriterTo `json:"-"`
 }

 func (t Tensor) blockSize() uint64 {
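
Note: u64 exists because GGUF writers disagree on the width of numeric metadata (older files and some writers use uint32, newer ones uint64, JSON-derived tooling float64), and every named accessor funnels through it. A hedged illustration against a hand-built map; the values are invented:

	kv := llm.KV{
		"general.architecture":    "llama",
		"llama.block_count":       uint32(32), // narrow width still resolves
		"general.parameter_count": uint64(7000000000),
	}
	fmt.Println(kv.BlockCount())     // 32, coerced from uint32
	fmt.Println(kv.ParameterCount()) // 7000000000
	fmt.Println(kv.FileType())       // "unknown": missing key coerces to 0
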
@@ -201,16 +254,16 @@ const (

 var ErrUnsupportedFormat = errors.New("unsupported model format")

-func DecodeGGML(rs io.ReadSeeker) (*GGML, error) {
+func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
-		return nil, err
+		return nil, 0, err
 	}

 	var c container
 	switch magic {
 	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
-		return nil, ErrUnsupportedFormat
+		return nil, 0, ErrUnsupportedFormat
 	case FILE_MAGIC_GGLA:
 		c = &containerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
@@ -218,25 +271,24 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, error) {
 	case FILE_MAGIC_GGUF_BE:
 		c = &containerGGUF{ByteOrder: binary.BigEndian}
 	default:
-		return nil, errors.New("invalid file magic")
+		return nil, 0, errors.New("invalid file magic")
 	}

 	model, err := c.Decode(rs)
 	if errors.Is(err, io.EOF) {
 		// noop
 	} else if err != nil {
-		return nil, err
+		return nil, 0, err
 	}

 	offset, err := rs.Seek(0, io.SeekCurrent)
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}

 	// final model type
 	return &GGML{
 		container: c,
 		model:     model,
-		Size:      offset,
-	}, nil
+	}, offset, nil
 }
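
Note: Size leaves the GGML struct and becomes a second return value, so the byte count of a blob travels with the call instead of riding on the decoded struct. Call sites migrate like this (rs is an assumed io.ReadSeeker):

	// before: ggml, err := DecodeGGML(rs); size := ggml.Size
	ggml, size, err := DecodeGGML(rs)
	if err != nil {
		return err
	}
	_ = ggml // metadata via ggml.KV() and ggml.Tensors(), as before
	_ = size // bytes consumed by this blob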

@@ -6,8 +6,6 @@ import (
 	"fmt"
 	"io"
 	"strings"
-
-	"github.com/ollama/ollama/format"
 )

 type containerGGUF struct {
@@ -90,8 +88,8 @@ const (
 type gguf struct {
 	*containerGGUF

-	KV
-	Tensors []Tensor
+	kv      KV
+	tensors []*Tensor

 	parameters uint64
 }
@@ -99,7 +97,7 @@ type gguf struct {

 func newGGUF(container *containerGGUF) *gguf {
 	return &gguf{
 		containerGGUF: container,
-		KV:            make(KV),
+		kv:            make(KV),
 	}
 }
@@ -107,6 +105,14 @@ func NewGGUFV3(bo binary.ByteOrder) *gguf {
 	return newGGUF(&containerGGUF{ByteOrder: bo, Version: 3})
 }

+func (llm *gguf) KV() KV {
+	return llm.kv
+}
+
+func (llm *gguf) Tensors() []*Tensor {
+	return llm.tensors
+}
+
 func (llm *gguf) numTensor() uint64 {
 	switch llm.Version {
 	case 1:
@@ -129,30 +135,6 @@ func (llm *gguf) numKV() uint64 {
 	}
 }

-func (llm *gguf) ModelFamily() string {
-	if t, ok := llm.KV["general.architecture"].(string); ok {
-		return t
-	}
-
-	return "unknown"
-}
-
-func (llm *gguf) ModelType() string {
-	if llm.parameters > 0 {
-		return format.HumanNumber(llm.parameters)
-	}
-
-	return "unknown"
-}
-
-func (llm *gguf) FileType() string {
-	if t, ok := llm.KV["general.file_type"].(uint32); ok {
-		return fileType(t)
-	}
-
-	return "unknown"
-}
-
 func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	// decode key-values
 	for i := 0; uint64(i) < llm.numKV(); i++ {
@@ -202,7 +184,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 			return err
 		}

-		llm.KV[k] = v
+		llm.kv[k] = v
 	}

 	// decode tensors
@@ -243,11 +225,14 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 			Shape: shape[:],
 		}

-		llm.Tensors = append(llm.Tensors, tensor)
+		llm.tensors = append(llm.tensors, &tensor)
 		llm.parameters += tensor.parameters()
 	}

-	alignment, ok := llm.KV["general.alignment"].(uint32)
+	// patch KV with parameter count
+	llm.kv["general.parameter_count"] = llm.parameters
+
+	alignment, ok := llm.kv["general.alignment"].(uint32)
 	if !ok {
 		alignment = 32
 	}
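
Note: Decode now stamps the accumulated parameter total into the map, which is what lets the removed gguf.ModelType collapse into generic helpers: the old body was format.HumanNumber(llm.parameters), and its equivalent after this commit is a one-liner (the "7B"-style output is an assumption about format.HumanNumber, which this diff does not show):

	modelType := format.HumanNumber(ggml.KV().ParameterCount()) // e.g. "7B"
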
@@ -262,7 +247,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 		return err
 	}

-	for _, tensor := range llm.Tensors {
+	for _, tensor := range llm.tensors {
 		padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
 		if _, err := rs.Seek(padded, io.SeekCurrent); err != nil {
 			return err
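
Note: the padded expression rounds each tensor's byte size up to the next multiple of alignment, assumed here (as in GGUF practice) to be a power of two: adding alignment-1 and then masking with &^(alignment-1) clears the low bits. A worked example at the default alignment of 32:

	size, alignment := int64(100), int64(32)
	padded := (size + alignment - 1) & ^(alignment - 1)
	fmt.Println(padded) // 131 &^ 31 = 128
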
@@ -272,60 +257,6 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	return nil
 }

-func (llm *gguf) NumLayers() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumHead() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumEmbed() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumHeadKv() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumCtx() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.context_length", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumGQA() uint32 {
-	numHeadKv := llm.NumHeadKv()
-	if numHeadKv == 0 {
-		return 0
-	}
-
-	return llm.NumHead() / numHeadKv
-}
-
 func readGGUF[T any](llm *gguf, r io.Reader) (T, error) {
 	var t T
 	err := binary.Read(r, llm.ByteOrder, &t)

@@ -35,14 +35,14 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	}
 	defer f.Close()

-	ggml, err := DecodeGGML(f)
+	ggml, size, err := DecodeGGML(f)
 	if err != nil {
 		return nil, err
 	}

-	if opts.NumCtx > int(ggml.NumCtx()) {
-		slog.Warn(fmt.Sprintf("requested context length is greater than model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx()))
-		opts.NumCtx = int(ggml.NumCtx())
+	if opts.NumCtx > int(ggml.KV().ContextLength()) {
+		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
+		opts.NumCtx = int(ggml.KV().ContextLength())
 	}

 	if opts.NumCtx < 4 {
@@ -50,18 +50,16 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	}

 	vram, _ := gpu.CheckVRAM()
-	size := ggml.Size

 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())

 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
-	graph := int64(ggml.NumGQA()) * kv / 6
+	graph := int64(ggml.KV().GQA()) * kv / 6

-	// certain model architectures don't support gpu inference yet
-	if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
+	if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
 		opts.NumGPU = 0
 	}
@@ -105,7 +103,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	// 2. the proportional kv cache for all devices (kv * % layers)
 	// 3. the proportional model (size * % layers / # devices)
 	// This estimates the number of layers
-	maxlayers := int64(ggml.NumLayers()) + 1
+	maxlayers := int64(ggml.KV().BlockCount()) + 1
 	devices := int64(info.DeviceCount)
 	avg := vram / devices
 	layers := maxlayers * (avg - graph) / (kv + size/devices)
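
Note: the fp16 KV-cache estimate now reads every dimension from GGUF metadata. Plugging in illustrative 7B-class numbers (not from the diff): NumCtx=2048, block_count=32, embedding_length=4096, head_count = head_count_kv = 32 gives 2*2*2048*32*4096*32/32 = 1073741824 bytes, i.e. 1 GiB of K/V cache. One behavioral nuance: the old expression divided by max(ggml.NumHead(), 1), while the new one divides by HeadCount() directly, which is 0 if the metadata omits the key.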

@@ -26,6 +26,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/convert"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/version"
@@ -420,37 +421,32 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			defer bin.Close()

 			var offset int64
-CREATE:
 			for {
 				fn(api.ProgressResponse{Status: "creating model layer"})
 				if _, err := bin.Seek(offset, io.SeekStart); err != nil {
 					return err
 				}

-				ggml, err := llm.DecodeGGML(bin)
-				if err != nil {
-					slog.Error(fmt.Sprintf("error decoding gguf file: %q", err))
-					switch {
-					case errors.Is(err, io.EOF):
-						break CREATE
-					case errors.Is(err, llm.ErrUnsupportedFormat):
-						return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
-					default:
-						return err
-					}
+				ggml, size, err := llm.DecodeGGML(bin)
+				if errors.Is(err, io.EOF) {
+					break
+				} else if errors.Is(err, llm.ErrUnsupportedFormat) {
+					return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
+				} else if err != nil {
+					return err
 				}

 				config.SetModelFormat(ggml.Name())
-				config.SetModelFamily(ggml.ModelFamily())
-				config.SetModelType(ggml.ModelType())
-				config.SetFileType(ggml.FileType())
+				config.SetModelFamily(ggml.KV().Architecture())
+				config.SetModelType(format.HumanNumber(ggml.KV().ParameterCount()))
+				config.SetFileType(ggml.KV().FileType())

 				mediatype := mediatype
-				if ggml.ModelFamily() == "clip" {
+				if ggml.KV().Architecture() == "clip" {
 					mediatype = "application/vnd.ollama.image.projector"
 				}

-				sr := io.NewSectionReader(bin, offset, ggml.Size)
+				sr := io.NewSectionReader(bin, offset, size)
 				layer, err := NewLayer(sr, mediatype)
 				if err != nil {
 					return err
@@ -458,7 +454,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 				layers.Add(layer)

-				offset += ggml.Size
+				offset += size
 			}
 		case "adapter":
 			if strings.HasPrefix(c.Args, "@") {
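
Note: the rewritten loop handles files that carry several concatenated GGUF blobs (a model followed by a clip projector, for instance): each pass seeks to offset, decodes one blob, cuts its layer from exactly [offset, offset+size), and advances. The labeled CREATE break was only ever needed because a bare break inside a switch exits the switch, not the loop; the if/else chain removes that wrinkle. A condensed sketch of the pattern, with bin an assumed open *os.File:

	var offset int64
	for {
		if _, err := bin.Seek(offset, io.SeekStart); err != nil {
			return err
		}
		ggml, size, err := llm.DecodeGGML(bin)
		if errors.Is(err, io.EOF) {
			break // no more blobs
		} else if err != nil {
			return err
		}
		_ = ggml // per-blob metadata drives config and mediatype
		offset += size
	}
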
@@ -477,12 +473,12 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			}
 			defer bin.Close()

-			ggml, err := llm.DecodeGGML(bin)
+			_, size, err := llm.DecodeGGML(bin)
 			if err != nil {
 				return err
 			}

-			sr := io.NewSectionReader(bin, 0, ggml.Size)
+			sr := io.NewSectionReader(bin, 0, size)
 			layer, err := NewLayer(sr, mediatype)
 			if err != nil {
 				return err
@@ -554,13 +550,6 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 		}
 	}

-	// xxx - can this be removed?
-	if config.ModelType == "65B" {
-		if gqa, ok := formattedParams["gqa"].(int); ok && gqa == 8 {
-			config.ModelType = "70B"
-		}
-	}
-
 	var b bytes.Buffer
 	if err := json.NewEncoder(&b).Encode(formattedParams); err != nil {
 		return err