From 60e47573a6efef0c9d13a2970a3951d739b9304e Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Tue, 27 Aug 2024 11:11:53 -0700
Subject: [PATCH] more tokenizer tests

---
 convert/tokenizer_test.go | 112 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/convert/tokenizer_test.go b/convert/tokenizer_test.go
index ed0175a4..d9550e09 100644
--- a/convert/tokenizer_test.go
+++ b/convert/tokenizer_test.go
@@ -79,6 +79,118 @@ func TestParseTokenizer(t *testing.T) {
 				Template: "",
 			},
 		},
+		{
+			name: "added tokens",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 999,
+							"content": "<unused999>",
+							"special": false
+						}
+					]
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<unused999>"},
+					Scores: []float32{999},
+					Types:  []int32{4},
+				},
+				Pre: "default",
+			},
+		},
+		{
+			name: "added tokens overlap vocab",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 0,
+							"content": "<pad>",
+							"special": true
+						}
+					],
+					"model": {
+						"vocab": {
+							"<pad>": 0
+						}
+					}
+				}`),
+			}),
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<pad>"},
+					Scores: []float32{0},
+					Types:  []int32{3},
+				},
+				Pre: "default",
+			},
+		},
+		{
+			name: "special token types",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 0,
+							"content": "<pad>",
+							"special": true
+						},
+						{
+							"id": 1,
+							"content": "<eos>",
+							"special": true
+						},
+						{
+							"id": 2,
+							"content": "<bos>",
+							"special": true
+						},
+						{
+							"id": 3,
+							"content": "<unk>",
+							"special": true
+						}
+					],
+					"model": {
+						"vocab": {
+							"<pad>": 0,
+							"<eos>": 1,
+							"<bos>": 2,
+							"<unk>": 3
+						}
+					}
+				}`),
+				"tokenizer_config.json": strings.NewReader(`{
+					"add_bos_token": true,
+					"add_eos_token": false,
+					"bos_token": "<bos>",
+					"eos_token": "<eos>",
+					"pad_token": "<pad>",
+					"unk_token": "<unk>"
+				}`),
+			}),
+			specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<pad>", "<eos>", "<bos>", "<unk>"},
+					Scores: []float32{0, 1, 2, 3},
+					Types:  []int32{3, 3, 3, 3},
+				},
+				SpecialVocabulary: []*SpecialVocabulary{
+					{Type: "pad", Content: "<pad>", ID: 0, AddToken: false},
+					{Type: "eos", Content: "<eos>", ID: 1, AddToken: false},
+					{Type: "bos", Content: "<bos>", ID: 2, AddToken: true},
+					{Type: "unk", Content: "<unk>", ID: 3, AddToken: false},
+				},
+				Pre: "default",
+			},
+		},
 	}
 
 	for _, tt := range cases {
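
Note on the expected values in this patch: the `Types` integers follow the llama.cpp token-type convention, where 3 marks a control token and 4 a user-defined one; `AddToken` mirrors the `add_bos_token`/`add_eos_token` flags in tokenizer_config.json, and each `Scores` entry simply echoes the token's id (hence score 999 for the id-999 token). The Go sketch below illustrates that special-flag-to-type mapping; the names (tokenTypeControl, addedToken, typeOf) are hypothetical and not part of ollama's converter API, with the 3/4 values taken from the expected Types above.

	// Illustrative sketch only: not ollama's converter code.
	package main

	import "fmt"

	const (
		tokenTypeControl     int32 = 3 // "special": true  -> control token
		tokenTypeUserDefined int32 = 4 // "special": false -> user-defined token
	)

	type addedToken struct {
		ID      int
		Content string
		Special bool
	}

	// typeOf maps an added token to the vocabulary type the tests expect:
	// special added tokens become control tokens, all others user-defined.
	func typeOf(t addedToken) int32 {
		if t.Special {
			return tokenTypeControl
		}
		return tokenTypeUserDefined
	}

	func main() {
		for _, t := range []addedToken{
			{ID: 999, Content: "<unused999>", Special: false}, // expects type 4
			{ID: 0, Content: "<pad>", Special: true},          // expects type 3
		} {
			fmt.Printf("%s -> type %d\n", t.Content, typeOf(t))
		}
	}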