refactor model parsing

Michael Yang 2024-03-13 11:03:56 -07:00
parent 011bb67351
commit d338d70492
5 changed files with 131 additions and 197 deletions


@@ -35,7 +35,7 @@ type ggla struct {
 	*containerGGLA

 	kv      KV
-	tensors []Tensor
+	tensors []*Tensor
 }

 func newGGLA(container *containerGGLA) *ggla {
@@ -45,18 +45,26 @@ func newGGLA(container *containerGGLA) *ggla {
 	}
 }

-func (m *ggla) decode(rs io.ReadSeeker) error {
+func (llm *ggla) KV() KV {
+	return llm.kv
+}
+
+func (llm *ggla) Tensors() []*Tensor {
+	return llm.tensors
+}
+
+func (llm *ggla) decode(rs io.ReadSeeker) error {
 	var r uint32
 	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
 		return err
 	}
-	m.kv["r"] = r
+	llm.kv["r"] = r

 	var alpha uint32
 	if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
 		return err
 	}
-	m.kv["alpha"] = alpha
+	llm.kv["alpha"] = alpha

 	for {
 		var dims uint32
@@ -115,50 +123,6 @@ func (m *ggla) decode(rs io.ReadSeeker) error {
 			return err
 		}

-		m.tensors = append(m.tensors, t)
+		llm.tensors = append(llm.tensors, &t)
 	}
 }
-
-func (m *ggla) KV() KV {
-	return m.kv
-}
-
-func (m *ggla) Tensor() []Tensor {
-	return m.tensors
-}
-
-func (*ggla) ModelFamily() string {
-	return "ggla"
-}
-
-func (*ggla) ModelType() string {
-	panic("not implemented")
-}
-
-func (*ggla) FileType() string {
-	panic("not implemented")
-}
-
-func (*ggla) NumLayers() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumGQA() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumEmbed() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumHead() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumHeadKv() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumCtx() uint32 {
-	panic("not implemented")
-}
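
Note: ggla now satisfies the slimmed-down model interface through the generic KV()/Tensors() accessors instead of nine per-field methods, most of which only panicked. A minimal sketch of how a caller might read the adapter hyperparameters after this change; the "r" and "alpha" keys are exactly what the decode loop above stores, but adapterRank itself is hypothetical and not part of this commit:

	// sketch, as if written in package llm
	func adapterRank(g *GGML) uint32 {
		// "r" is stored by (*ggla).decode as a uint32; zero means absent
		if r, ok := g.KV()["r"].(uint32); ok {
			return r
		}
		return 0
	}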


@@ -3,14 +3,13 @@ package llm
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 )

 type GGML struct {
 	container
 	model
-
-	Size int64
 }

 const (
@@ -90,28 +89,82 @@ func fileType(fileType uint32) string {
 }

 type model interface {
-	ModelFamily() string
-	ModelType() string
-	FileType() string
-	NumLayers() uint32
-	NumGQA() uint32
-	NumEmbed() uint32
-	NumHead() uint32
-	NumHeadKv() uint32
-	NumCtx() uint32
+	KV() KV
+	Tensors() []*Tensor
 }

 type KV map[string]any

+func (kv KV) u64(key string) uint64 {
+	switch v := kv[key].(type) {
+	case uint64:
+		return v
+	case uint32:
+		return uint64(v)
+	case float64:
+		return uint64(v)
+	default:
+		return 0
+	}
+}
+
+func (kv KV) Architecture() string {
+	if s, ok := kv["general.architecture"].(string); ok {
+		return s
+	}
+
+	return "unknown"
+}
+
+func (kv KV) ParameterCount() uint64 {
+	return kv.u64("general.parameter_count")
+}
+
+func (kv KV) FileType() string {
+	if u64 := kv.u64("general.file_type"); u64 > 0 {
+		return fileType(uint32(u64))
+	}
+
+	return "unknown"
+}
+
+func (kv KV) BlockCount() uint64 {
+	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
+}
+
+func (kv KV) HeadCount() uint64 {
+	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
+}
+
+func (kv KV) HeadCountKV() uint64 {
+	return kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture()))
+}
+
+func (kv KV) GQA() uint64 {
+	if headCountKV := kv.HeadCountKV(); headCountKV > 0 {
+		return kv.HeadCount() / headCountKV
+	}
+
+	return 0
+}
+
+func (kv KV) EmbeddingLength() uint64 {
+	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
+}
+
+func (kv KV) ContextLength() uint64 {
+	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
+}
+
 type Tensor struct {
-	Name   string
-	Kind   uint32
-	Offset uint64
+	Name   string `json:"name"`
+	Kind   uint32 `json:"kind"`
+	Offset uint64 `json:"-"`

 	// Shape is the number of elements in each dimension
-	Shape []uint64
+	Shape []uint64 `json:"shape"`

-	io.WriterTo
+	io.WriterTo `json:"-"`
 }

 func (t Tensor) blockSize() uint64 {
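
Note: these KV helpers centralize metadata lookups that were previously duplicated per format. u64 coerces the numeric types GGUF metadata actually carries (uint64, uint32, float64), and the per-model getters compose architecture-prefixed keys. A small illustration with hypothetical values, not taken from this commit:

	// sketch, as if written in package llm
	kv := KV{
		"general.architecture": "llama",
		"llama.block_count":    uint32(32),   // u64 coerces this to uint64
		"llama.context_length": uint64(4096),
	}
	fmt.Println(kv.Architecture())  // llama
	fmt.Println(kv.BlockCount())    // 32
	fmt.Println(kv.ContextLength()) // 4096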
@@ -201,16 +254,16 @@ const (
 var ErrUnsupportedFormat = errors.New("unsupported model format")

-func DecodeGGML(rs io.ReadSeeker) (*GGML, error) {
+func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
-		return nil, err
+		return nil, 0, err
 	}

 	var c container
 	switch magic {
 	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
-		return nil, ErrUnsupportedFormat
+		return nil, 0, ErrUnsupportedFormat
 	case FILE_MAGIC_GGLA:
 		c = &containerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
@@ -218,25 +271,24 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, error) {
 	case FILE_MAGIC_GGUF_BE:
 		c = &containerGGUF{ByteOrder: binary.BigEndian}
 	default:
-		return nil, errors.New("invalid file magic")
+		return nil, 0, errors.New("invalid file magic")
 	}

 	model, err := c.Decode(rs)
 	if errors.Is(err, io.EOF) {
 		// noop
 	} else if err != nil {
-		return nil, err
+		return nil, 0, err
 	}

 	offset, err := rs.Seek(0, io.SeekCurrent)
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}

 	// final model type
 	return &GGML{
 		container: c,
 		model:     model,
-		Size:      offset,
-	}, nil
+	}, offset, nil
 }
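
Note: DecodeGGML now reports the number of bytes it consumed as a second return value instead of storing it in GGML.Size, so GGML carries only metadata and callers own the offset bookkeeping. The new call shape, sketched under the assumption that f is an io.ReadSeeker positioned at a model blob:

	ggml, size, err := DecodeGGML(f)
	if err != nil {
		return err
	}
	arch := ggml.KV().Architecture() // metadata now flows through KV()
	_ = size                         // bytes consumed; previously ggml.Size
	_ = arch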


@@ -6,8 +6,6 @@ import (
 	"fmt"
 	"io"
 	"strings"
-
-	"github.com/ollama/ollama/format"
 )

 type containerGGUF struct {
@@ -90,8 +88,8 @@ const (
 type gguf struct {
 	*containerGGUF

-	KV
-	Tensors []Tensor
+	kv      KV
+	tensors []*Tensor

 	parameters uint64
 }
@@ -99,7 +97,7 @@ type gguf struct {
 func newGGUF(container *containerGGUF) *gguf {
 	return &gguf{
 		containerGGUF: container,
-		KV:            make(KV),
+		kv:            make(KV),
 	}
 }

@@ -107,6 +105,14 @@ func NewGGUFV3(bo binary.ByteOrder) *gguf {
 	return newGGUF(&containerGGUF{ByteOrder: bo, Version: 3})
 }

+func (llm *gguf) KV() KV {
+	return llm.kv
+}
+
+func (llm *gguf) Tensors() []*Tensor {
+	return llm.tensors
+}
+
 func (llm *gguf) numTensor() uint64 {
 	switch llm.Version {
 	case 1:
@@ -129,30 +135,6 @@ func (llm *gguf) numKV() uint64 {
 	}
 }

-func (llm *gguf) ModelFamily() string {
-	if t, ok := llm.KV["general.architecture"].(string); ok {
-		return t
-	}
-
-	return "unknown"
-}
-
-func (llm *gguf) ModelType() string {
-	if llm.parameters > 0 {
-		return format.HumanNumber(llm.parameters)
-	}
-
-	return "unknown"
-}
-
-func (llm *gguf) FileType() string {
-	if t, ok := llm.KV["general.file_type"].(uint32); ok {
-		return fileType(t)
-	}
-
-	return "unknown"
-}
-
 func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	// decode key-values
 	for i := 0; uint64(i) < llm.numKV(); i++ {
@@ -202,7 +184,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 			return err
 		}

-		llm.KV[k] = v
+		llm.kv[k] = v
 	}

 	// decode tensors
@@ -243,11 +225,14 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 			Shape: shape[:],
 		}

-		llm.Tensors = append(llm.Tensors, tensor)
+		llm.tensors = append(llm.tensors, &tensor)
 		llm.parameters += tensor.parameters()
 	}

-	alignment, ok := llm.KV["general.alignment"].(uint32)
+	// patch KV with parameter count
+	llm.kv["general.parameter_count"] = llm.parameters
+
+	alignment, ok := llm.kv["general.alignment"].(uint32)
 	if !ok {
 		alignment = 32
 	}
@@ -262,7 +247,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 		return err
 	}

-	for _, tensor := range llm.Tensors {
+	for _, tensor := range llm.tensors {
 		padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
 		if _, err := rs.Seek(padded, io.SeekCurrent); err != nil {
 			return err
@@ -272,60 +257,6 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	return nil
 }

-func (llm *gguf) NumLayers() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumHead() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumEmbed() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumHeadKv() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumCtx() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.context_length", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumGQA() uint32 {
-	numHeadKv := llm.NumHeadKv()
-	if numHeadKv == 0 {
-		return 0
-	}
-
-	return llm.NumHead() / numHeadKv
-}
-
 func readGGUF[T any](llm *gguf, r io.Reader) (T, error) {
 	var t T
 	err := binary.Read(r, llm.ByteOrder, &t)
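
Note: the decoder now records the running tensor-element total under "general.parameter_count", which is what KV.ParameterCount() in ggml.go reads back; the format.HumanNumber pretty-printing that used to live in ModelType() moves out to the caller (see the server changes below). A hedged sketch of the replacement for the old ModelType():

	if n := ggml.KV().ParameterCount(); n > 0 {
		fmt.Println(format.HumanNumber(n)) // e.g. "7B"
	}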


@@ -35,14 +35,14 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	}
 	defer f.Close()

-	ggml, err := DecodeGGML(f)
+	ggml, size, err := DecodeGGML(f)
 	if err != nil {
 		return nil, err
 	}

-	if opts.NumCtx > int(ggml.NumCtx()) {
-		slog.Warn(fmt.Sprintf("requested context length is greater than model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx()))
-		opts.NumCtx = int(ggml.NumCtx())
+	if opts.NumCtx > int(ggml.KV().ContextLength()) {
+		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
+		opts.NumCtx = int(ggml.KV().ContextLength())
 	}

 	if opts.NumCtx < 4 {
@@ -50,18 +50,16 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	}

 	vram, _ := gpu.CheckVRAM()
-	size := ggml.Size
-
 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())

 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
-	graph := int64(ggml.NumGQA()) * kv / 6
+	graph := int64(ggml.KV().GQA()) * kv / 6

 	// certain model architectures don't support gpu inference yet
-	if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
+	if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
 		opts.NumGPU = 0
 	}
@@ -105,7 +103,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 		// 2. the proportional kv cache for all devices (kv * % layers)
 		// 3. the proportional model (size * % layers / # devices)
 		// This estimates the number of layers
-		maxlayers := int64(ggml.NumLayers()) + 1
+		maxlayers := int64(ggml.KV().BlockCount()) + 1
 		devices := int64(info.DeviceCount)
 		avg := vram / devices
 		layers := maxlayers * (avg - graph) / (kv + size/devices)
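
Note: the memory estimate is unchanged; only its inputs now come from KV(). One behavioral nit: the old divisor max(ggml.NumHead(), 1) guarded against a zero head count, while int64(ggml.KV().HeadCount()) does not. A worked example with hypothetical llama-7B-like metadata (block_count=32, embedding_length=4096, head_count=head_count_kv=32) and opts.NumCtx=2048:

	// 2 bytes (fp16) * 2 (K and V) * n_ctx * n_layer * n_embd * n_head_kv / n_head
	kv := int64(2 * 2 * 2048 * 32 * 4096 * 32 / 32) // 1073741824 bytes = 1 GiB
	graph := kv / 6                                 // GQA = 32/32 = 1, so ~171 MiB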


@@ -26,6 +26,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/convert"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/version"
@@ -420,37 +421,32 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			defer bin.Close()

 			var offset int64
-		CREATE:
 			for {
 				fn(api.ProgressResponse{Status: "creating model layer"})
 				if _, err := bin.Seek(offset, io.SeekStart); err != nil {
 					return err
 				}

-				ggml, err := llm.DecodeGGML(bin)
-				if err != nil {
-					slog.Error(fmt.Sprintf("error decoding gguf file: %q", err))
-					switch {
-					case errors.Is(err, io.EOF):
-						break CREATE
-					case errors.Is(err, llm.ErrUnsupportedFormat):
-						return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
-					default:
-						return err
-					}
+				ggml, size, err := llm.DecodeGGML(bin)
+				if errors.Is(err, io.EOF) {
+					break
+				} else if errors.Is(err, llm.ErrUnsupportedFormat) {
+					return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
+				} else if err != nil {
+					return err
 				}

 				config.SetModelFormat(ggml.Name())
-				config.SetModelFamily(ggml.ModelFamily())
-				config.SetModelType(ggml.ModelType())
-				config.SetFileType(ggml.FileType())
+				config.SetModelFamily(ggml.KV().Architecture())
+				config.SetModelType(format.HumanNumber(ggml.KV().ParameterCount()))
+				config.SetFileType(ggml.KV().FileType())

 				mediatype := mediatype
-				if ggml.ModelFamily() == "clip" {
+				if ggml.KV().Architecture() == "clip" {
 					mediatype = "application/vnd.ollama.image.projector"
 				}

-				sr := io.NewSectionReader(bin, offset, ggml.Size)
+				sr := io.NewSectionReader(bin, offset, size)
 				layer, err := NewLayer(sr, mediatype)
 				if err != nil {
 					return err
@@ -458,7 +454,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 				layers.Add(layer)

-				offset += ggml.Size
+				offset += size
 			}

 		case "adapter":
 			if strings.HasPrefix(c.Args, "@") {
@@ -477,12 +473,12 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			}
 			defer bin.Close()

-			ggml, err := llm.DecodeGGML(bin)
+			_, size, err := llm.DecodeGGML(bin)
 			if err != nil {
 				return err
 			}

-			sr := io.NewSectionReader(bin, 0, ggml.Size)
+			sr := io.NewSectionReader(bin, 0, size)
 			layer, err := NewLayer(sr, mediatype)
 			if err != nil {
 				return err
@@ -554,13 +550,6 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 		}
 	}

-	// xxx - can this be removed?
-	if config.ModelType == "65B" {
-		if gqa, ok := formattedParams["gqa"].(int); ok && gqa == 8 {
-			config.ModelType = "70B"
-		}
-	}
-
 	var b bytes.Buffer
 	if err := json.NewEncoder(&b).Encode(formattedParams); err != nil {
 		return err
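
Note: with size returned directly, the CREATE label and switch collapse into a plain if/else chain, and the layer-splitting loop reduces to seek, decode, advance. The "65B with gqa=8 means 70B" hack also goes away because the parameter count is now computed from the tensors themselves. Distilled shape of the loop above, as a sketch:

	var offset int64
	for {
		if _, err := bin.Seek(offset, io.SeekStart); err != nil {
			return err
		}

		_, size, err := llm.DecodeGGML(bin)
		if errors.Is(err, io.EOF) {
			break // ran off the end: every layer has been created
		} else if err != nil {
			return err
		}

		// ...create a layer from bin[offset : offset+size]...
		offset += size
	}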