From 77903ab8b4fb8075faad7bde5bde2eee3173e407 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Mon, 29 Jul 2024 14:53:02 -0700
Subject: [PATCH] llama3.1

---
 convert/convert_bert.go                       |  1 -
 convert/convert_gemma.go                      |  1 -
 convert/convert_gemma2.go                     |  1 -
 convert/convert_llama.go                      | 43 +++++++++++++++++--
 convert/convert_phi3.go                       |  1 -
 convert/convert_test.go                       |  1 +
 .../testdata/Meta-Llama-3.1-8B-Instruct.json  |  3 ++
 llm/memory_test.go                            |  1 -
 server/sched_test.go                          |  1 -
 9 files changed, 44 insertions(+), 9 deletions(-)
 create mode 100644 convert/testdata/Meta-Llama-3.1-8B-Instruct.json

diff --git a/convert/convert_bert.go b/convert/convert_bert.go
index 4547a705..6e7d59fe 100644
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@@ -88,7 +88,6 @@ func (p *bert) parseMore(fsys fs.FS) error {
 func (p *bert) KV(t *Tokenizer) llm.KV {
     kv := p.Parameters.KV(t)
     kv["general.architecture"] = "bert"
-    kv["general.name"] = "bert"
     kv["bert.attention.causal"] = false
     kv["bert.pooling_type"] = p.PoolingType
 
diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go
index 333e4c83..c4316808 100644
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@@ -26,7 +26,6 @@ var _ Converter = (*gemma)(nil)
 func (p *gemma) KV(t *Tokenizer) llm.KV {
     kv := p.Parameters.KV(t)
     kv["general.architecture"] = "gemma"
-    kv["general.name"] = "gemma"
     kv["gemma.context_length"] = p.MaxPositionEmbeddings
     kv["gemma.embedding_length"] = p.HiddenSize
     kv["gemma.block_count"] = p.HiddenLayers
diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go
index 66be02d6..084f9c52 100644
--- a/convert/convert_gemma2.go
+++ b/convert/convert_gemma2.go
@@ -14,7 +14,6 @@ type gemma2 struct {
 func (p *gemma2) KV(t *Tokenizer) llm.KV {
     kv := p.Parameters.KV(t)
     kv["general.architecture"] = "gemma2"
-    kv["general.name"] = "gemma2"
     kv["gemma2.context_length"] = p.MaxPositionEmbeddings
     kv["gemma2.embedding_length"] = p.HiddenSize
     kv["gemma2.block_count"] = p.HiddenLayers
diff --git a/convert/convert_llama.go b/convert/convert_llama.go
index 498d1321..27f924fb 100644
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -3,6 +3,7 @@ package convert
 import (
     "cmp"
     "fmt"
+    "math"
     "strings"
 
     "github.com/pdevine/tensor"
@@ -27,8 +28,14 @@ type llama struct {
     NumKeyValueHeads uint32  `json:"num_key_value_heads"`
     RopeTheta        float32 `json:"rope_theta"`
     RopeScaling      struct {
-        Type   string  `json:"type"`
-        Factor float32 `json:"factor"`
+        Type                            string  `json:"type"`
+        RopeType                        string  `json:"rope_type"`
+        Factor                          float32 `json:"factor"`
+        LowFrequencyFactor              float32 `json:"low_freq_factor"`
+        HighFrequencyFactor             float32 `json:"high_freq_factor"`
+        OriginalMaxPositionalEmbeddings uint32  `json:"original_max_positional_embeddings"`
+
+        factors ropeFactor
     } `json:"rope_scaling"`
     RMSNormEPS   float32 `json:"rms_norm_eps"`
     LayerNormEPS float32 `json:"layer_norm_eps"`
@@ -42,7 +49,6 @@ var _ Converter = (*llama)(nil)
 func (p *llama) KV(t *Tokenizer) llm.KV {
     kv := p.Parameters.KV(t)
     kv["general.architecture"] = "llama"
-    kv["general.name"] = "llama"
     kv["llama.vocab_size"] = p.VocabSize
 
     kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
@@ -71,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
     if p.RopeScaling.Type == "linear" {
         kv["llama.rope.scaling.type"] = p.RopeScaling.Type
         kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor
+    } else if p.RopeScaling.RopeType == "llama3" {
+        dim := p.HiddenSize / p.NumAttentionHeads
+        for i := uint32(0); i < dim; i += 2 {
+            factor := cmp.Or(p.RopeScaling.Factor, 8.0)
+            factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0)
+            factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0)
+
+            original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192)
+            lambdaLow := float32(original) / factorLow
+            lambdaHigh := float32(original) / factorHigh
+
+            lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim))
+            if lambda < float64(lambdaHigh) {
+                p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0)
+            } else if lambda > float64(lambdaLow) {
+                p.RopeScaling.factors = append(p.RopeScaling.factors, factor)
+            } else {
+                smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow)
+                p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth))
+            }
+        }
     }
 
     if p.NumKeyValueHeads > 0 {
@@ -95,6 +122,16 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
 
 func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
     var out []llm.Tensor
+
+    if p.RopeScaling.factors != nil {
+        out = append(out, llm.Tensor{
+            Name:     "rope_freqs.weight",
+            Kind:     0,
+            Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
+            WriterTo: p.RopeScaling.factors,
+        })
+    }
+
     for _, t := range ts {
         if strings.HasSuffix(t.Name(), "attn_q.weight") ||
             strings.HasSuffix(t.Name(), "attn_k.weight") {
diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go
index 4ee59ff5..64d3d012 100644
--- a/convert/convert_phi3.go
+++ b/convert/convert_phi3.go
@@ -40,7 +40,6 @@ var _ Converter = (*phi3)(nil)
 func (p *phi3) KV(t *Tokenizer) llm.KV {
     kv := p.Parameters.KV(t)
     kv["general.architecture"] = "phi3"
-    kv["general.name"] = "phi3"
     kv["phi3.context_length"] = p.MaxPositionEmbeddings
     kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
     kv["phi3.feed_forward_length"] = p.IntermediateSize
diff --git a/convert/convert_test.go b/convert/convert_test.go
index e78afab7..64b7df3b 100644
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -62,6 +62,7 @@ func TestMain(m *testing.M) {
 func TestConvertFull(t *testing.T) {
     cases := []string{
         "Meta-Llama-3-8B-Instruct",
+        "Meta-Llama-3.1-8B-Instruct",
         "Mistral-7B-Instruct-v0.2",
         "Mixtral-8x7B-Instruct-v0.1",
         "gemma-2b-it",
diff --git a/convert/testdata/Meta-Llama-3.1-8B-Instruct.json b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json
new file mode 100644
index 00000000..ad7cd20a
--- /dev/null
+++ b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json
@@ -0,0 +1,3 @@
+{
+  "rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222"
+}
diff --git a/llm/memory_test.go b/llm/memory_test.go
index 6cf0119f..ffb14286 100644
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@@ -33,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) {
     assert.Len(t, tensors, inputLayerCount+1)
     err = WriteGGUF(f, KV{
         "general.architecture": "llama",
-        "general.name":         "name",
         "llama.context_length": uint32(32),
         "llama.embedding_length": uint32(4096),
         "llama.block_count": uint32(inputLayerCount),
diff --git a/server/sched_test.go b/server/sched_test.go
index 713b9259..fb049574 100644
--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -117,7 +117,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 
     require.NoError(t, llm.WriteGGUF(f, llm.KV{
         "general.architecture": "llama",
-        "general.name":         "name",
         "llama.context_length": uint32(32),
         "llama.embedding_length": uint32(4096),
         "llama.block_count": uint32(1),
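
For reference, the rope factor computation added in convert_llama.go can be exercised on its own. The standalone Go sketch below is not part of the patch; it assumes the Llama 3.1 8B defaults that the patch falls back to (hidden size 4096, 32 attention heads, rope theta 500000, scale factor 8, low/high frequency factors 1 and 4, original context length 8192) and mirrors the loop above, printing the per-frequency factors that end up in rope_freqs.weight.

package main

import (
	"fmt"
	"math"
)

func main() {
	// Assumed Llama 3.1 8B defaults (the same fallbacks the patch uses).
	const (
		hiddenSize        = 4096
		numAttentionHeads = 32
		ropeTheta         = 500000.0
		factor            = 8.0  // rope_scaling.factor
		factorLow         = 1.0  // low_freq_factor
		factorHigh        = 4.0  // high_freq_factor
		original          = 8192 // pre-extension context length
	)

	dim := hiddenSize / numAttentionHeads // per-head dimension: 128

	// Wavelength thresholds, measured in token positions.
	lambdaLow := float64(original) / factorLow   // longer than this: fully rescaled
	lambdaHigh := float64(original) / factorHigh // shorter than this: left unchanged

	var factors []float64
	for i := 0; i < dim; i += 2 {
		// Wavelength of this rotary frequency pair: 2*pi / theta^(-i/dim).
		lambda := 2 * math.Pi * math.Pow(ropeTheta, float64(i)/float64(dim))
		switch {
		case lambda < lambdaHigh: // high frequency: keep as-is
			factors = append(factors, 1.0)
		case lambda > lambdaLow: // low frequency: stretch by the full factor
			factors = append(factors, factor)
		default: // mid band: interpolate between the two regimes
			smooth := (float64(original)/lambda - factorLow) / (factorHigh - factorLow)
			factors = append(factors, 1.0/((1-smooth)/factor+smooth))
		}
	}

	// Prints: 64 factors, first=1.000 last=8.000
	fmt.Printf("%d factors, first=%.3f last=%.3f\n",
		len(factors), factors[0], factors[len(factors)-1])
}

With these defaults the sketch yields 64 entries, one per rotary frequency pair of a 128-dimension head, which matches the shape the Tensors change writes for the rope_freqs.weight tensor and the hash recorded in the new testdata file.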