diff --git a/cmd/cmd.go b/cmd/cmd.go index a8a02605..b75c0b5e 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -204,6 +204,12 @@ func tempZipFiles(path string) (string, error) { // safetensors files might be unresolved git lfs references; skip if they are // covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors files = append(files, st...) + } else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 { + // covers adapters.safetensors + files = append(files, st...) + } else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 { + // covers adapter_model.safetensors + files = append(files, st...) } else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 { // pytorch files might also be unresolved git lfs references; skip if they are // covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin diff --git a/convert/convert.go b/convert/convert.go index 5a314cdd..8c7b0943 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -12,12 +12,22 @@ import ( "github.com/ollama/ollama/llm" ) -type Parameters struct { +type ModelParameters struct { Architectures []string `json:"architectures"` VocabSize uint32 `json:"vocab_size"` } -func (Parameters) KV(t *Tokenizer) llm.KV { +type AdapterParameters struct { + Alpha uint32 `json:"lora_alpha"` + LoraLayers uint32 `json:"lora_layers"` + LoraParameters struct { + Rank uint32 `json:"rank"` + Alpha float32 `json:"alpha"` + Scale float32 `json:"scale"` + } `json:"lora_parameters"` +} + +func (ModelParameters) KV(t *Tokenizer) llm.KV { kv := llm.KV{ "general.file_type": uint32(1), "general.quantization_version": uint32(2), @@ -44,17 +54,40 @@ func (Parameters) KV(t *Tokenizer) llm.KV { return kv } -func (Parameters) specialTokenTypes() []string { +func (p AdapterParameters) KV() llm.KV { + var alpha float32 + if p.LoraParameters.Alpha == 0 { + alpha = float32(p.Alpha) + } else { + alpha = p.LoraParameters.Alpha + } + + kv := llm.KV{ + "adapter.lora.alpha": alpha, + "adapter.type": "lora", + "general.file_type": uint32(1), + "general.type": "adapter", + "general.version": "v0.2", + } + + return kv +} + +func (ModelParameters) specialTokenTypes() []string { return []string{ "bos", "eos", "unk", "sep", "pad", "cls", "mask", } } -func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { +func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { return llm.WriteGGUF(ws, kv, ts) } -type Converter interface { +func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { + return llm.WriteGGUF(ws, kv, ts) +} + +type ModelConverter interface { // KV maps parameters to LLM key-values KV(*Tokenizer) llm.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. @@ -73,17 +106,67 @@ type moreParser interface { parseMore(fs.FS) error } +type AdapterConverter interface { + // KV maps parameters to LLM key-values + KV(llm.KV) llm.KV + // Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here. + Tensors([]Tensor) []llm.Tensor + // Replacements returns a list of string pairs to replace in tensor names. 
+ // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details + Replacements() []string + + writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error +} + +func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error { + bts, err := fs.ReadFile(fsys, "adapter_config.json") + if err != nil { + return err + } + + var p AdapterParameters + if err := json.Unmarshal(bts, &p); err != nil { + return err + } + + arch, ok := baseKV["general.architecture"] + if !ok { + return errors.New("architecture not set for the base model") + } + + var conv AdapterConverter + switch arch { + case "llama": + conv = &llamaAdapter{} + case "gemma2": + conv = &gemma2Adapter{} + default: + return errors.New("unsupported architecture") + } + + ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...)) + if err != nil { + return err + } + + if err := json.Unmarshal(bts, conv); err != nil { + return err + } + + return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts)) +} + // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // and files it finds in the input path. // Supported input model formats include safetensors. // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model. -func Convert(fsys fs.FS, ws io.WriteSeeker) error { +func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error { bts, err := fs.ReadFile(fsys, "config.json") if err != nil { return err } - var p Parameters + var p ModelParameters if err := json.Unmarshal(bts, &p); err != nil { return err } @@ -92,20 +175,20 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { return errors.New("unknown architecture") } - var conv Converter + var conv ModelConverter switch p.Architectures[0] { case "LlamaForCausalLM", "MistralForCausalLM": - conv = &llama{} + conv = &llamaModel{} case "MixtralForCausalLM": - conv = &mixtral{} + conv = &mixtralModel{} case "GemmaForCausalLM": - conv = &gemma{} + conv = &gemmaModel{} case "Gemma2ForCausalLM": - conv = &gemma2{} + conv = &gemma2Model{} case "Phi3ForCausalLM": - conv = &phi3{} + conv = &phi3Model{} case "BertModel": - conv = &bert{} + conv = &bertModel{} default: return errors.New("unsupported architecture") } diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 6e7d59fe..ea5facaa 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -11,8 +11,8 @@ import ( "github.com/ollama/ollama/llm" ) -type bert struct { - Parameters +type bertModel struct { + ModelParameters NLayers uint32 `json:"n_layers"` NumHiddenLayers uint32 `json:"num_hidden_layers"` NLayer uint32 `json:"n_layer"` @@ -33,11 +33,11 @@ type bert struct { } var ( - _ Converter = (*bert)(nil) - _ moreParser = (*bert)(nil) + _ ModelConverter = (*bertModel)(nil) + _ moreParser = (*bertModel)(nil) ) -func (p *bert) parseMore(fsys fs.FS) error { +func (p *bertModel) parseMore(fsys fs.FS) error { bts, err := fs.ReadFile(fsys, "modules.json") if err != nil { return err @@ -85,8 +85,8 @@ func (p *bert) parseMore(fsys fs.FS) error { return nil } -func (p *bert) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *bertModel) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "bert" kv["bert.attention.causal"] = false kv["bert.pooling_type"] = p.PoolingType @@ -132,7 +132,7 @@ func (p *bert) KV(t *Tokenizer) llm.KV { return kv } -func (p *bert) Tensors(ts []Tensor) []llm.Tensor { +func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor for _, t := range 
ts { if slices.Contains([]string{ @@ -154,7 +154,7 @@ func (p *bert) Tensors(ts []Tensor) []llm.Tensor { return out } -func (bert) Replacements() []string { +func (bertModel) Replacements() []string { return []string{ "encoder.layer", "blk", "encoder.layers", "blk", diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index c4316808..b8865294 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -9,8 +9,8 @@ import ( "github.com/ollama/ollama/llm" ) -type gemma struct { - Parameters +type gemmaModel struct { + ModelParameters MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` HiddenSize uint32 `json:"hidden_size"` HiddenLayers uint32 `json:"num_hidden_layers"` @@ -21,10 +21,10 @@ type gemma struct { HeadDim uint32 `json:"head_dim"` } -var _ Converter = (*gemma)(nil) +var _ ModelConverter = (*gemmaModel)(nil) -func (p *gemma) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *gemmaModel) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "gemma" kv["gemma.context_length"] = p.MaxPositionEmbeddings kv["gemma.embedding_length"] = p.HiddenSize @@ -42,8 +42,8 @@ func (p *gemma) KV(t *Tokenizer) llm.KV { return kv } -func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { - out := make([]llm.Tensor, 0, len(ts)) +func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor for _, t := range ts { if strings.HasSuffix(t.Name(), "_norm.weight") { t.SetRepacker(p.addOne) @@ -60,7 +60,7 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *gemma) Replacements() []string { +func (p *gemmaModel) Replacements() []string { return []string{ "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -77,7 +77,7 @@ func (p *gemma) Replacements() []string { } } -func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { +func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data)) ones := tensor.Ones(tensor.Float32, int(shape[0])) diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go index 084f9c52..c4ee2d09 100644 --- a/convert/convert_gemma2.go +++ b/convert/convert_gemma2.go @@ -4,15 +4,15 @@ import ( "github.com/ollama/ollama/llm" ) -type gemma2 struct { - gemma +type gemma2Model struct { + gemmaModel SlidingWindow uint32 `json:"sliding_window"` AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"` FinalLogitSoftcap float32 `json:"final_logit_softcapping"` } -func (p *gemma2) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *gemma2Model) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "gemma2" kv["gemma2.context_length"] = p.MaxPositionEmbeddings kv["gemma2.embedding_length"] = p.HiddenSize @@ -33,9 +33,9 @@ func (p *gemma2) KV(t *Tokenizer) llm.KV { return kv } -func (p *gemma2) Replacements() []string { +func (p *gemma2Model) Replacements() []string { return append( - p.gemma.Replacements(), + p.gemmaModel.Replacements(), "post_attention_layernorm", "post_attention_norm", "pre_feedforward_layernorm", "ffn_norm", "post_feedforward_layernorm", "post_ffw_norm", diff --git a/convert/convert_gemma2_adapter.go b/convert/convert_gemma2_adapter.go new file mode 100644 index 00000000..a89a25f4 --- /dev/null +++ b/convert/convert_gemma2_adapter.go @@ -0,0 +1,91 @@ +package convert + +import ( + "strings" + + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" + + 
"github.com/ollama/ollama/llm" +) + +type gemma2Adapter struct { + AdapterParameters +} + +var _ AdapterConverter = (*gemma2Adapter)(nil) + +func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV { + kv := p.AdapterParameters.KV() + kv["general.architecture"] = "gemma2" + return kv +} + +func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor + for _, t := range ts { + shape := t.Shape() + if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) || + (strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) { + shape[0], shape[1] = shape[1], shape[0] + t.SetRepacker(p.repack) + } + + out = append(out, llm.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (p *gemma2Adapter) Replacements() []string { + return []string{ + "base_model.model.", "", + "model.layers", "blk", + "self_attn.q_proj", "attn_q", + "self_attn.k_proj", "attn_k", + "self_attn.v_proj", "attn_v", + "self_attn.o_proj", "attn_output", + "mlp.gate_proj", "ffn_gate", + "mlp.down_proj", "ffn_down", + "mlp.up_proj", "ffn_up", + "lora_A.weight", "weight.lora_a", + "lora_B.weight", "weight.lora_b", + "lora_a", "weight.lora_a", + "lora_b", "weight.lora_b", + } +} + +func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) { + dims := []int{int(shape[1]), int(shape[0])} + + n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + if err := n.T(1, 0); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + + ts, err := native.SelectF32(n, 1) + if err != nil { + return nil, err + } + + var f32s []float32 + for _, t := range ts { + f32s = append(f32s, t...) 
+ } + + return f32s, nil +} diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 27f924fb..5dedb829 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -12,8 +12,8 @@ import ( "github.com/ollama/ollama/llm" ) -type llama struct { - Parameters +type llamaModel struct { + ModelParameters NLayers uint32 `json:"n_layers"` NumHiddenLayers uint32 `json:"num_hidden_layers"` NLayer uint32 `json:"n_layer"` @@ -44,10 +44,10 @@ type llama struct { HeadDim uint32 `json:"head_dim"` } -var _ Converter = (*llama)(nil) +var _ ModelConverter = (*llamaModel)(nil) -func (p *llama) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *llamaModel) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "llama" kv["llama.vocab_size"] = p.VocabSize @@ -120,7 +120,7 @@ func (p *llama) KV(t *Tokenizer) llm.KV { return kv } -func (p *llama) Tensors(ts []Tensor) []llm.Tensor { +func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor if p.RopeScaling.factors != nil { @@ -149,7 +149,7 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *llama) Replacements() []string { +func (p *llamaModel) Replacements() []string { return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", @@ -167,7 +167,7 @@ func (p *llama) Replacements() []string { } } -func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) { +func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) { var dims []int for _, dim := range shape { dims = append(dims, int(dim)) diff --git a/convert/convert_llama_adapter.go b/convert/convert_llama_adapter.go new file mode 100644 index 00000000..08ddee10 --- /dev/null +++ b/convert/convert_llama_adapter.go @@ -0,0 +1,169 @@ +package convert + +import ( + "cmp" + "strings" + + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" + + "github.com/ollama/ollama/llm" +) + +type llamaAdapter struct { + AdapterParameters + NumAttentionHeads uint32 `json:"num_attention_heads"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` +} + +var _ AdapterConverter = (*llamaAdapter)(nil) + +func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV { + kv := p.AdapterParameters.KV() + kv["general.architecture"] = "llama" + kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"] + kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"] + + p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32) + + return kv +} + +func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor + for _, t := range ts { + shape := t.Shape() + if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) || + (strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) { + shape[0], shape[1] = shape[1], shape[0] + t.SetRepacker(p.repackAndTranspose) + } else { + t.SetRepacker(p.repack) + } + + out = append(out, llm.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: shape, + WriterTo: t, + }) + } + + return out +} + +func (p *llamaAdapter) Replacements() []string { + return []string{ + "base_model.model.", "", + "model.layers", "blk", + "self_attn.q_proj", "attn_q", + "self_attn.k_proj", "attn_k", + "self_attn.v_proj", "attn_v", + "self_attn.o_proj", "attn_output", + "mlp.gate_proj", "ffn_gate", + "mlp.down_proj", "ffn_down", + "mlp.up_proj", "ffn_up", + "lora_A.weight", "weight.lora_a", + "lora_B.weight", "weight.lora_b", + "lora_a", "weight.lora_a", + "lora_b", 
"weight.lora_b", + } +} + +func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) { + dims := []int{int(shape[1]), int(shape[0])} + + var heads uint32 + if strings.HasSuffix(name, "attn_q.weight.lora_a") { + heads = p.NumAttentionHeads + } else if strings.HasSuffix(name, "attn_k.weight.lora_a") { + heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) + } else { + return data, nil + } + + n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil { + return nil, err + } + + if err := n.T(0, 2, 1, 3); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + + ts, err := native.SelectF32(n, 1) + if err != nil { + return nil, err + } + + var f32s []float32 + for _, t := range ts { + f32s = append(f32s, t...) + } + + return f32s, nil +} + +func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) { + dims := []int{int(shape[1]), int(shape[0])} + + n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + var heads uint32 + if strings.HasSuffix(name, "attn_q.weight.lora_a") { + heads = p.NumAttentionHeads + } else if strings.HasSuffix(name, "attn_k.weight.lora_a") { + heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) + } + + if heads > 0 { + if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil { + return nil, err + } + + if err := n.T(0, 2, 1, 3); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + } + + if err := n.T(1, 0); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + + ts, err := native.SelectF32(n, 1) + if err != nil { + return nil, err + } + + var f32s []float32 + for _, t := range ts { + f32s = append(f32s, t...) + } + + return f32s, nil +} diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index 97a86b30..43b7c8b1 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -9,14 +9,14 @@ import ( "github.com/ollama/ollama/llm" ) -type mixtral struct { - llama +type mixtralModel struct { + llamaModel NumLocalExperts uint32 `json:"num_local_experts"` NumExpertsPerToken uint32 `json:"num_experts_per_tok"` } -func (p *mixtral) KV(t *Tokenizer) llm.KV { - kv := p.llama.KV(t) +func (p *mixtralModel) KV(t *Tokenizer) llm.KV { + kv := p.llamaModel.KV(t) if p.NumLocalExperts > 0 { kv["llama.expert_count"] = p.NumLocalExperts @@ -29,7 +29,7 @@ func (p *mixtral) KV(t *Tokenizer) llm.KV { return kv } -func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { +func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor { oldnew := []string{ "model.layers", "blk", "w1", "ffn_gate_exps", @@ -67,12 +67,12 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { }) } - return append(out, p.llama.Tensors(ts)...) + return append(out, p.llamaModel.Tensors(ts)...) 
} -func (p *mixtral) Replacements() []string { +func (p *mixtralModel) Replacements() []string { return append( - p.llama.Replacements(), + p.llamaModel.Replacements(), "block_sparse_moe.gate", "ffn_gate_inp", ) } diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 64d3d012..3de0d404 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -11,8 +11,8 @@ import ( "github.com/ollama/ollama/llm" ) -type phi3 struct { - Parameters +type phi3Model struct { + ModelParameters NumHiddenLayers uint32 `json:"num_hidden_layers"` NLayers uint32 `json:"n_layers"` HiddenSize uint32 `json:"hidden_size"` @@ -35,10 +35,10 @@ type phi3 struct { SlidingWindow uint32 `json:"sliding_window"` } -var _ Converter = (*phi3)(nil) +var _ ModelConverter = (*phi3Model)(nil) -func (p *phi3) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *phi3Model) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "phi3" kv["phi3.context_length"] = p.MaxPositionEmbeddings kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) @@ -68,7 +68,7 @@ func (p *phi3) KV(t *Tokenizer) llm.KV { return kv } -func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { +func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor { var addRopeFactors sync.Once out := make([]llm.Tensor, 0, len(ts)+2) @@ -100,7 +100,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *phi3) Replacements() []string { +func (p *phi3Model) Replacements() []string { return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", diff --git a/convert/convert_test.go b/convert/convert_test.go index 64b7df3b..56b34f22 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -1,7 +1,9 @@ package convert import ( + "bytes" "crypto/sha256" + "encoding/binary" "encoding/hex" "encoding/json" "flag" @@ -29,7 +31,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) { } defer f.Close() - if err := Convert(fsys, f); err != nil { + if err := ConvertModel(fsys, f); err != nil { t.Fatal(err) } @@ -51,6 +53,34 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) { return r, m.KV(), m.Tensors() } +func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string { + actual := make(map[string]string) + for k, v := range kv { + if s, ok := v.(json.Marshaler); !ok { + actual[k] = fmt.Sprintf("%v", v) + } else { + bts, err := json.Marshal(s) + if err != nil { + t.Fatal(err) + } + + actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts)) + } + } + + for _, tensor := range tensors.Items { + sha256sum := sha256.New() + sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size())) + if _, err := io.Copy(sha256sum, sr); err != nil { + t.Fatal(err) + } + + actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil)) + } + + return actual +} + func TestMain(m *testing.M) { var level slog.Level flag.TextVar(&level, "level", slog.LevelInfo, "log level") @@ -85,29 +115,7 @@ func TestConvertFull(t *testing.T) { } f, kv, tensors := convertFull(t, os.DirFS(p)) - actual := make(map[string]string) - for k, v := range kv { - if s, ok := v.(json.Marshaler); !ok { - actual[k] = fmt.Sprintf("%v", v) - } else { - bts, err := json.Marshal(s) - if err != nil { - t.Fatal(err) - } - - actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts)) - } - } - - for _, tensor := range tensors.Items { - sha256sum := sha256.New() - sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), 
int64(tensor.Size())) - if _, err := io.Copy(sha256sum, sr); err != nil { - t.Fatal(err) - } - - actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil)) - } + actual := generateResultsJSON(t, f, kv, tensors) expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt))) if err != nil { @@ -131,3 +139,209 @@ func TestConvertFull(t *testing.T) { }) } } + +func TestConvertAdapter(t *testing.T) { + type AdapterCase struct { + Name string + BaseKV map[string]any + Expected map[string]string + } + + cases := []AdapterCase{ + { + Name: "discollama", + BaseKV: map[string]any{ + "general.architecture": "llama", + "llama.attention.head_count": uint32(32), + "llama.attention.head_count_kv": uint32(8), + }, + Expected: map[string]string{ + "general.architecture": "llama", + "general.file_type": "1", + "general.parameter_count": "106496", + "general.type": "adapter", + "general.version": "v0.2", + "adapter.lora.alpha": "16", + "adapter.type": "lora", + "llama.attention.head_count": "32", + "llama.attention.head_count_kv": "8", + "blk.31.attn_q.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50", + "blk.31.attn_q.weight.lora_b": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50", + "blk.31.attn_v.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50", + "blk.31.attn_v.weight.lora_b": "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857", + }, + }, + } + + for _, c := range cases { + t.Run(c.Name, func(t *testing.T) { + t.Parallel() + + f, err := os.CreateTemp(t.TempDir(), "f16") + if err != nil { + t.Fatal(err) + } + defer f.Close() + + tempDir := t.TempDir() + generateLoraTestData(t, tempDir) + + if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil { + t.Fatal(err) + } + + r, err := os.Open(f.Name()) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + m, _, err := llm.DecodeGGML(r, math.MaxInt) + if err != nil { + t.Fatal(err) + } + + if _, err := r.Seek(0, io.SeekStart); err != nil { + t.Fatal(err) + } + + actual := generateResultsJSON(t, r, m.KV(), m.Tensors()) + + keys := maps.Keys(c.Expected) + slices.Sort(keys) + for _, k := range keys { + if v, ok := actual[k]; !ok { + t.Errorf("missing %s", k) + } else if v != c.Expected[k] { + t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v) + } + } + }) + } +} + +func generateLoraTestData(t *testing.T, tempDir string) { + type tensorData struct { + Offsets []int `json:"data_offsets"` + Type string `json:"dtype"` + Shape []int `json:"shape"` + } + offset := 4096 * 8 * 4 + + td := map[string]*tensorData{"__metadata__": nil} + td["model.layers.31.self_attn.q_proj.lora_a"] = &tensorData{ + Offsets: []int{0, offset}, + Type: "F32", + Shape: []int{4096, 8}, + } + td["model.layers.31.self_attn.q_proj.lora_b"] = &tensorData{ + Offsets: []int{offset, offset * 2}, + Type: "F32", + Shape: []int{8, 4096}, + } + td["model.layers.31.self_attn.v_proj.lora_a"] = &tensorData{ + Offsets: []int{offset * 2, offset * 3}, + Type: "F32", + Shape: []int{4096, 8}, + } + td["model.layers.31.self_attn.v_proj.lora_b"] = &tensorData{ + Offsets: []int{offset * 3, offset*3 + 8*1024*4}, + Type: "F32", + Shape: []int{8, 1024}, + } + + data, err := json.Marshal(td) + if err != nil { + t.Fatal(err) + } + + var buf bytes.Buffer + + l := int64(len(data)) + err = binary.Write(&buf, binary.LittleEndian, l) + if err != nil { + t.Fatal(err) + } + + _, err = buf.Write(data) + if err != nil { + t.Fatal(err) + } + + // write some data for the tensors + + 
ones := make([]float32, 4096*8) + for i := range ones { + ones[i] = float32(1) + } + + for range 3 { + err = binary.Write(&buf, binary.LittleEndian, ones) + if err != nil { + t.Fatal(err) + } + } + + ones = make([]float32, 1024*8) + for i := range ones { + ones[i] = float32(1) + } + + err = binary.Write(&buf, binary.LittleEndian, ones) + if err != nil { + t.Fatal(err) + } + + fdata, err := os.Create(filepath.Join(tempDir, "adapters.safetensors")) + if err != nil { + t.Fatal(err) + } + defer fdata.Close() + + _, err = fdata.Write(buf.Bytes()) + if err != nil { + t.Fatal(err) + } + + configData := ` +{ + "adapter_path": "adapters-test", + "batch_size": 8, + "config": "config-tiny.json", + "data": "../discollama-completion", + "grad_checkpoint": null, + "iters": 1000, + "learning_rate": 1e-05, + "lora_layers": 1, + "lora_parameters": { + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "scale": 2.0 + }, + "lr_schedule": null, + "max_seq_length": 2048, + "model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct", + "resume_adapter_file": null, + "save_every": 100, + "seed": 0, + "steps_per_eval": 200, + "steps_per_report": 10, + "test": false, + "test_batches": 500, + "train": true, + "use_dora": false, + "val_batches": 25 +} +` + f, err := os.Create(filepath.Join(tempDir, "adapter_config.json")) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + _, err = f.WriteString(configData) + if err != nil { + t.Fatal(err) + } +} diff --git a/convert/reader.go b/convert/reader.go index 5bba0406..c1218e66 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -64,6 +64,8 @@ func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) { }{ {"model-*-of-*.safetensors", parseSafetensors}, {"model.safetensors", parseSafetensors}, + {"adapters.safetensors", parseSafetensors}, + {"adapter_model.safetensors", parseSafetensors}, {"pytorch_model-*-of-*.bin", parseTorch}, {"pytorch_model.bin", parseTorch}, {"consolidated.*.pth", parseTorch}, diff --git a/llm/ggml.go b/llm/ggml.go index 4c68adf9..ab436095 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -43,6 +43,14 @@ func (kv KV) Architecture() string { return "unknown" } +func (kv KV) Kind() string { + if s, ok := kv["general.type"].(string); ok { + return s + } + + return "unknown" +} + func (kv KV) ParameterCount() uint64 { return kv.u64("general.parameter_count") } diff --git a/server/images.go b/server/images.go index 8b3a67cf..b5bf7ad6 100644 --- a/server/images.go +++ b/server/images.go @@ -369,13 +369,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio parameters := make(map[string]any) var layers []Layer + var baseLayers []*layerGGML for _, c := range modelfile.Commands { mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name) + command := c.Name - switch c.Name { + switch command { case "model", "adapter": - var baseLayers []*layerGGML - if name := model.ParseName(c.Args); name.IsValid() { + if name := model.ParseName(c.Args); name.IsValid() && command == "model" { baseLayers, err = parseFromModel(ctx, name, fn) if err != nil { return err @@ -409,14 +410,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio } defer blob.Close() - baseLayers, err = parseFromFile(ctx, blob, digest, fn) + baseLayers, err = parseFromFile(ctx, command, baseLayers, blob, digest, fn) if err != nil { return err } } else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil { defer file.Close() - baseLayers, err = parseFromFile(ctx, file, "", fn) + baseLayers, err = 
parseFromFile(ctx, command, baseLayers, file, "", fn) if err != nil { return err } diff --git a/server/model.go b/server/model.go index b17bf0e3..55fb2d8d 100644 --- a/server/model.go +++ b/server/model.go @@ -81,7 +81,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe return layers, nil } -func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { +func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { fi, err := f.Stat() if err != nil { return nil, err @@ -108,16 +108,38 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api. defer t.Close() defer os.Remove(t.Name()) - fn(api.ProgressResponse{Status: "converting model"}) - if err := convert.Convert(convert.NewZipReader(r, p, 32<<20), t); err != nil { - return nil, err + var layerType string + + switch command { + case "adapter": + var baseModel *llm.GGML + for _, l := range baseLayers { + if l.GGML != nil { + baseModel = l.GGML + break + } + } + + if baseModel == nil { + return nil, fmt.Errorf("no base model specified for the adapter") + } + + if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil { + return nil, err + } + layerType = "application/vnd.ollama.image.adapter" + case "model": + if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil { + return nil, err + } + layerType = "application/vnd.ollama.image.model" } if _, err := t.Seek(0, io.SeekStart); err != nil { return nil, err } - layer, err := NewLayer(t, "application/vnd.ollama.image.model") + layer, err := NewLayer(t, layerType) if err != nil { return nil, err } @@ -139,7 +161,7 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api. 
return detectChatTemplate(layers) } -func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { +func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { sr := io.NewSectionReader(file, 0, 512) contentType, err := detectContentType(sr) if err != nil { @@ -150,7 +172,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap case "gguf", "ggla": // noop case "application/zip": - return parseFromZipFile(ctx, file, digest, fn) + return parseFromZipFile(ctx, command, baseLayers, file, digest, fn) default: return nil, fmt.Errorf("unsupported content type: %s", contentType) } @@ -170,7 +192,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap } mediatype := "application/vnd.ollama.image.model" - if ggml.Name() == "ggla" { + if ggml.Name() == "ggla" || ggml.KV().Kind() == "adapter" { mediatype = "application/vnd.ollama.image.adapter" } else if ggml.KV().Architecture() == "clip" { mediatype = "application/vnd.ollama.image.projector" diff --git a/server/model_test.go b/server/model_test.go index 63fc408d..7753c549 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -153,7 +153,7 @@ func TestParseFromFileFromLayer(t *testing.T) { t.Fatalf("failed to seek to start: %v", err) } - layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {}) + layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, "", func(api.ProgressResponse) {}) if err != nil { t.Fatalf("failed to parse from file: %v", err) } @@ -166,7 +166,7 @@ func TestParseFromFileFromLayer(t *testing.T) { t.Fatalf("failed to seek to start: %v", err) } - layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {}) + layers2, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, layers[0].Digest, func(api.ProgressResponse) {}) if err != nil { t.Fatalf("failed to parse from file: %v", err) } @@ -206,7 +206,7 @@ func TestParseLayerFromCopy(t *testing.T) { t.Fatalf("failed to seek to start: %v", err) } - layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {}) + layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file2, "", func(api.ProgressResponse) {}) if err != nil { t.Fatalf("failed to parse from file: %v", err) }
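
For reviewers who want to exercise the new path end to end, here is a minimal sketch (not part of this change) of how the ConvertAdapter entry point composes with the existing GGUF decoder. The convertAdapterSketch helper, the CLI wrapper, and all paths are hypothetical; the convert.ConvertAdapter and llm.DecodeGGML signatures are the ones used in this diff.

package main

import (
	"log"
	"math"
	"os"

	"github.com/ollama/ollama/convert"
	"github.com/ollama/ollama/llm"
)

// convertAdapterSketch decodes a base model's GGUF metadata, then converts
// a safetensors LoRA adapter against it, mirroring what server/model.go
// now does for the "adapter" Modelfile command. (Hypothetical helper.)
func convertAdapterSketch(baseModelPath, adapterDir, outPath string) error {
	base, err := os.Open(baseModelPath)
	if err != nil {
		return err
	}
	defer base.Close()

	// ConvertAdapter only needs the base model's KV metadata, never its
	// weights: general.architecture selects the converter (llamaAdapter or
	// gemma2Adapter), and for llama the attention head counts drive the
	// permutation of the attn_q/attn_k lora_a tensors.
	ggml, _, err := llm.DecodeGGML(base, math.MaxInt)
	if err != nil {
		return err
	}

	out, err := os.Create(outPath)
	if err != nil {
		return err
	}
	defer out.Close()

	// adapterDir is expected to hold adapter_config.json plus
	// adapters.safetensors or adapter_model.safetensors, the filenames
	// registered in convert/reader.go above.
	return convert.ConvertAdapter(os.DirFS(adapterDir), out, ggml.KV())
}

func main() {
	if len(os.Args) != 4 {
		log.Fatal("usage: convertadapter <base.gguf> <adapter-dir> <out.gguf>")
	}
	if err := convertAdapterSketch(os.Args[1], os.Args[2], os.Args[3]); err != nil {
		log.Fatal(err)
	}
}

The resulting GGUF carries general.type = "adapter", so a subsequent parseFromFile will tag it application/vnd.ollama.image.adapter via the new KV().Kind() check rather than relying on the legacy ggla magic.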