refactor model parsing

parent 011bb67351
commit d338d70492

llm/ggla.go (62 changed lines)
@@ -35,7 +35,7 @@ type ggla struct {
 	*containerGGLA
 
 	kv      KV
-	tensors []Tensor
+	tensors []*Tensor
 }
 
 func newGGLA(container *containerGGLA) *ggla {
@@ -45,18 +45,26 @@ func newGGLA(container *containerGGLA) *ggla {
 	}
 }
 
-func (m *ggla) decode(rs io.ReadSeeker) error {
+func (llm *ggla) KV() KV {
+	return llm.kv
+}
+
+func (llm *ggla) Tensors() []*Tensor {
+	return llm.tensors
+}
+
+func (llm *ggla) decode(rs io.ReadSeeker) error {
 	var r uint32
 	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
 		return err
 	}
-	m.kv["r"] = r
+	llm.kv["r"] = r
 
 	var alpha uint32
 	if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
 		return err
 	}
-	m.kv["alpha"] = alpha
+	llm.kv["alpha"] = alpha
 
 	for {
 		var dims uint32
@@ -115,50 +123,6 @@ func (m *ggla) decode(rs io.ReadSeeker) error {
 			return err
 		}
 
-		m.tensors = append(m.tensors, t)
+		llm.tensors = append(llm.tensors, &t)
 	}
 }
-
-func (m *ggla) KV() KV {
-	return m.kv
-}
-
-func (m *ggla) Tensor() []Tensor {
-	return m.tensors
-}
-
-func (*ggla) ModelFamily() string {
-	return "ggla"
-}
-
-func (*ggla) ModelType() string {
-	panic("not implemented")
-}
-
-func (*ggla) FileType() string {
-	panic("not implemented")
-}
-
-func (*ggla) NumLayers() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumGQA() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumEmbed() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumHead() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumHeadKv() uint32 {
-	panic("not implemented")
-}
-
-func (*ggla) NumCtx() uint32 {
-	panic("not implemented")
-}
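Note: ggla now satisfies the trimmed-down model interface purely through KV() and Tensors(); the per-field getters (ModelFamily, NumLayers, ...) are gone and tensors are held as []*Tensor. A minimal sketch of the caller-side shape after this change (the loop body is illustrative, not from the commit):

    for _, t := range ggml.Tensors() {
        fmt.Println(t.Name, t.Kind, t.Shape)
    }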

llm/ggml.go (100 changed lines)
@@ -3,14 +3,13 @@ package llm
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 )
 
 type GGML struct {
 	container
 	model
-
-	Size int64
 }
 
 const (
@@ -90,28 +89,82 @@ func fileType(fileType uint32) string {
 }
 
 type model interface {
-	ModelFamily() string
-	ModelType() string
-	FileType() string
-	NumLayers() uint32
-	NumGQA() uint32
-	NumEmbed() uint32
-	NumHead() uint32
-	NumHeadKv() uint32
-	NumCtx() uint32
+	KV() KV
+	Tensors() []*Tensor
 }
 
 type KV map[string]any
 
+func (kv KV) u64(key string) uint64 {
+	switch v := kv[key].(type) {
+	case uint64:
+		return v
+	case uint32:
+		return uint64(v)
+	case float64:
+		return uint64(v)
+	default:
+		return 0
+	}
+}
+
+func (kv KV) Architecture() string {
+	if s, ok := kv["general.architecture"].(string); ok {
+		return s
+	}
+
+	return "unknown"
+}
+
+func (kv KV) ParameterCount() uint64 {
+	return kv.u64("general.parameter_count")
+}
+
+func (kv KV) FileType() string {
+	if u64 := kv.u64("general.file_type"); u64 > 0 {
+		return fileType(uint32(u64))
+	}
+
+	return "unknown"
+}
+
+func (kv KV) BlockCount() uint64 {
+	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
+}
+
+func (kv KV) HeadCount() uint64 {
+	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
+}
+
+func (kv KV) HeadCountKV() uint64 {
+	return kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture()))
+}
+
+func (kv KV) GQA() uint64 {
+	if headCountKV := kv.HeadCountKV(); headCountKV > 0 {
+		return kv.HeadCount() / headCountKV
+	}
+
+	return 0
+}
+
+func (kv KV) EmbeddingLength() uint64 {
+	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
+}
+
+func (kv KV) ContextLength() uint64 {
+	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
+}
+
 type Tensor struct {
-	Name string
-	Kind uint32
-	Offset uint64
+	Name   string `json:"name"`
+	Kind   uint32 `json:"kind"`
+	Offset uint64 `json:"-"`
 
 	// Shape is the number of elements in each dimension
-	Shape []uint64
+	Shape []uint64 `json:"shape"`
 
-	io.WriterTo
+	io.WriterTo `json:"-"`
 }
 
 func (t Tensor) blockSize() uint64 {
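The new KV helpers resolve architecture-scoped keys by prefixing them with general.architecture, and u64 tolerates the integer widths GGUF writers actually emit. An illustrative lookup (values invented for the example):

    kv := KV{
        "general.architecture": "llama",
        "llama.block_count":    uint32(32),
    }
    kv.Architecture() // "llama"
    kv.BlockCount()   // 32: u64 coerces the uint32 before returning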
@@ -201,16 +254,16 @@ const (
 
 var ErrUnsupportedFormat = errors.New("unsupported model format")
 
-func DecodeGGML(rs io.ReadSeeker) (*GGML, error) {
+func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 
 	var c container
 	switch magic {
 	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
-		return nil, ErrUnsupportedFormat
+		return nil, 0, ErrUnsupportedFormat
 	case FILE_MAGIC_GGLA:
 		c = &containerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
@@ -218,25 +271,24 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, error) {
 	case FILE_MAGIC_GGUF_BE:
 		c = &containerGGUF{ByteOrder: binary.BigEndian}
 	default:
-		return nil, errors.New("invalid file magic")
+		return nil, 0, errors.New("invalid file magic")
 	}
 
 	model, err := c.Decode(rs)
 	if errors.Is(err, io.EOF) {
 		// noop
 	} else if err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 
 	offset, err := rs.Seek(0, io.SeekCurrent)
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 
 	// final model type
 	return &GGML{
 		container: c,
 		model:     model,
-		Size:      offset,
-	}, nil
+	}, offset, nil
 }
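DecodeGGML now reports the decode-end offset to the caller instead of stashing it in GGML.Size, so the blob's byte length is threaded explicitly. A sketch of the new call shape (file name and surrounding error handling invented):

    f, err := os.Open("model.gguf")
    if err != nil {
        return err
    }
    defer f.Close()

    ggml, size, err := llm.DecodeGGML(f)
    if err != nil {
        return err
    }
    // size is the byte length of the decoded blob within f
    fmt.Println(ggml.KV().Architecture(), size)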

llm/gguf.go (105 changed lines)
@@ -6,8 +6,6 @@ import (
 	"fmt"
 	"io"
 	"strings"
-
-	"github.com/ollama/ollama/format"
 )
 
 type containerGGUF struct {
@@ -90,8 +88,8 @@ const (
 type gguf struct {
 	*containerGGUF
 
-	KV
-	Tensors []Tensor
+	kv      KV
+	tensors []*Tensor
 
 	parameters uint64
 }
@@ -99,7 +97,7 @@ type gguf struct {
 func newGGUF(container *containerGGUF) *gguf {
 	return &gguf{
 		containerGGUF: container,
-		KV:            make(KV),
+		kv:            make(KV),
 	}
 }
 
@@ -107,6 +105,14 @@ func NewGGUFV3(bo binary.ByteOrder) *gguf {
 	return newGGUF(&containerGGUF{ByteOrder: bo, Version: 3})
 }
 
+func (llm *gguf) KV() KV {
+	return llm.kv
+}
+
+func (llm *gguf) Tensors() []*Tensor {
+	return llm.tensors
+}
+
 func (llm *gguf) numTensor() uint64 {
 	switch llm.Version {
 	case 1:
@@ -129,30 +135,6 @@ func (llm *gguf) numKV() uint64 {
 	}
 }
 
-func (llm *gguf) ModelFamily() string {
-	if t, ok := llm.KV["general.architecture"].(string); ok {
-		return t
-	}
-
-	return "unknown"
-}
-
-func (llm *gguf) ModelType() string {
-	if llm.parameters > 0 {
-		return format.HumanNumber(llm.parameters)
-	}
-
-	return "unknown"
-}
-
-func (llm *gguf) FileType() string {
-	if t, ok := llm.KV["general.file_type"].(uint32); ok {
-		return fileType(t)
-	}
-
-	return "unknown"
-}
-
 func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	// decode key-values
 	for i := 0; uint64(i) < llm.numKV(); i++ {
@@ -202,7 +184,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 			return err
 		}
 
-		llm.KV[k] = v
+		llm.kv[k] = v
 	}
 
 	// decode tensors
@@ -243,11 +225,14 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 			Shape: shape[:],
 		}
 
-		llm.Tensors = append(llm.Tensors, tensor)
+		llm.tensors = append(llm.tensors, &tensor)
 		llm.parameters += tensor.parameters()
 	}
 
-	alignment, ok := llm.KV["general.alignment"].(uint32)
+	// patch KV with parameter count
+	llm.kv["general.parameter_count"] = llm.parameters
+
+	alignment, ok := llm.kv["general.alignment"].(uint32)
 	if !ok {
 		alignment = 32
 	}
@@ -262,7 +247,7 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 		return err
 	}
 
-	for _, tensor := range llm.Tensors {
+	for _, tensor := range llm.tensors {
 		padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
 		if _, err := rs.Seek(padded, io.SeekCurrent); err != nil {
 			return err
@@ -272,60 +257,6 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	return nil
 }
 
-func (llm *gguf) NumLayers() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumHead() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumEmbed() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumHeadKv() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumCtx() uint32 {
-	value, exists := llm.KV[fmt.Sprintf("%s.context_length", llm.ModelFamily())]
-	if !exists {
-		return 0
-	}
-
-	return value.(uint32)
-}
-
-func (llm *gguf) NumGQA() uint32 {
-	numHeadKv := llm.NumHeadKv()
-	if numHeadKv == 0 {
-		return 0
-	}
-
-	return llm.NumHead() / numHeadKv
-}
-
 func readGGUF[T any](llm *gguf, r io.Reader) (T, error) {
 	var t T
 	err := binary.Read(r, llm.ByteOrder, &t)
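With the gguf-specific getters deleted, metadata flows through the generic KV map, including the parameter count that Decode now patches into kv. An illustrative read (continuing the DecodeGGML sketch above):

    n := ggml.KV().ParameterCount() // reads "general.parameter_count"
    fmt.Println(format.HumanNumber(n))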

llm/llm.go (18 changed lines)
@@ -35,14 +35,14 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	}
 	defer f.Close()
 
-	ggml, err := DecodeGGML(f)
+	ggml, size, err := DecodeGGML(f)
 	if err != nil {
 		return nil, err
 	}
 
-	if opts.NumCtx > int(ggml.NumCtx()) {
-		slog.Warn(fmt.Sprintf("requested context length is greater than model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx()))
-		opts.NumCtx = int(ggml.NumCtx())
+	if opts.NumCtx > int(ggml.KV().ContextLength()) {
+		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
+		opts.NumCtx = int(ggml.KV().ContextLength())
 	}
 
 	if opts.NumCtx < 4 {
@@ -50,18 +50,16 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	}
 
 	vram, _ := gpu.CheckVRAM()
-	size := ggml.Size
 
 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())
 
 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
-	graph := int64(ggml.NumGQA()) * kv / 6
+	graph := int64(ggml.KV().GQA()) * kv / 6
 
 	// certain model architectures don't support gpu inference yet
-	if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
+	if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
 		opts.NumGPU = 0
 	}
 
@@ -105,7 +103,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	// 2. the proportional kv cache for all devices (kv * % layers)
 	// 3. the proportional model (size * % layers / # devices)
 	// This estimates the number of layers
-	maxlayers := int64(ggml.NumLayers()) + 1
+	maxlayers := int64(ggml.KV().BlockCount()) + 1
 	devices := int64(info.DeviceCount)
 	avg := vram / devices
 	layers := maxlayers * (avg - graph) / (kv + size/devices)

server/images.go
@@ -26,6 +26,7 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/convert"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/version"
@@ -420,37 +421,32 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			defer bin.Close()
 
 			var offset int64
-		CREATE:
 			for {
 				fn(api.ProgressResponse{Status: "creating model layer"})
 				if _, err := bin.Seek(offset, io.SeekStart); err != nil {
 					return err
 				}
 
-				ggml, err := llm.DecodeGGML(bin)
-				if err != nil {
-					slog.Error(fmt.Sprintf("error decoding gguf file: %q", err))
-					switch {
-					case errors.Is(err, io.EOF):
-						break CREATE
-					case errors.Is(err, llm.ErrUnsupportedFormat):
-						return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
-					default:
-						return err
-					}
-				}
+				ggml, size, err := llm.DecodeGGML(bin)
+				if errors.Is(err, io.EOF) {
+					break
+				} else if errors.Is(err, llm.ErrUnsupportedFormat) {
+					return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
+				} else if err != nil {
+					return err
+				}
 
 				config.SetModelFormat(ggml.Name())
-				config.SetModelFamily(ggml.ModelFamily())
-				config.SetModelType(ggml.ModelType())
-				config.SetFileType(ggml.FileType())
+				config.SetModelFamily(ggml.KV().Architecture())
+				config.SetModelType(format.HumanNumber(ggml.KV().ParameterCount()))
+				config.SetFileType(ggml.KV().FileType())
 
 				mediatype := mediatype
-				if ggml.ModelFamily() == "clip" {
+				if ggml.KV().Architecture() == "clip" {
 					mediatype = "application/vnd.ollama.image.projector"
 				}
 
-				sr := io.NewSectionReader(bin, offset, ggml.Size)
+				sr := io.NewSectionReader(bin, offset, size)
 				layer, err := NewLayer(sr, mediatype)
 				if err != nil {
 					return err
@@ -458,7 +454,7 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 
 				layers.Add(layer)
 
-				offset += ggml.Size
+				offset += size
 			}
 		case "adapter":
 			if strings.HasPrefix(c.Args, "@") {
@@ -477,12 +473,12 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			}
 			defer bin.Close()
 
-			ggml, err := llm.DecodeGGML(bin)
+			_, size, err := llm.DecodeGGML(bin)
 			if err != nil {
 				return err
 			}
 
-			sr := io.NewSectionReader(bin, 0, ggml.Size)
+			sr := io.NewSectionReader(bin, 0, size)
 			layer, err := NewLayer(sr, mediatype)
 			if err != nil {
 				return err
@@ -554,13 +550,6 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 		}
 	}
 
-	// xxx - can this be removed?
-	if config.ModelType == "65B" {
-		if gqa, ok := formattedParams["gqa"].(int); ok && gqa == 8 {
-			config.ModelType = "70B"
-		}
-	}
-
 	var b bytes.Buffer
 	if err := json.NewEncoder(&b).Encode(formattedParams); err != nil {
 		return err
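The CREATE label disappears because the io.EOF check now breaks the loop directly, and the size returned by DecodeGGML lets CreateModel slice each GGUF blob out of a file that may concatenate several. A condensed sketch of the loop's shape (not verbatim from the commit):

    var offset int64
    for {
        if _, err := bin.Seek(offset, io.SeekStart); err != nil {
            return err
        }
        _, size, err := llm.DecodeGGML(bin)
        if errors.Is(err, io.EOF) {
            break // no more blobs in the file
        } else if err != nil {
            return err
        }
        sr := io.NewSectionReader(bin, offset, size) // exactly one blob
        // ... build a layer from sr ...
        offset += size
    }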