From 2d315ba9a984f8db8f108b967b3af6fa4aa67669 Mon Sep 17 00:00:00 2001
From: Patrick Devine
Date: Wed, 8 May 2024 16:56:18 -0700
Subject: [PATCH] add missing file

---
 convert/tokenizer.go | 76 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 convert/tokenizer.go

diff --git a/convert/tokenizer.go b/convert/tokenizer.go
new file mode 100644
index 00000000..a7da81e6
--- /dev/null
+++ b/convert/tokenizer.go
@@ -0,0 +1,76 @@
+package convert
+
+import (
+    "encoding/json"
+    "io"
+    "os"
+)
+
+type Tokenizer struct {
+    Version     string         `json:"version"`
+    AddedTokens []Token        `json:"added_tokens"`
+    Model       TokenizerModel `json:"model"`
+}
+
+type TokenizerModel struct {
+    Type   string         `json:"type"`
+    Vocab  map[string]int `json:"vocab"`
+    Merges []string       `json:"merges"`
+    Tokens []Token
+}
+
+type Token struct {
+    ID          int    `json:"id"`
+    Content     string `json:"content"`
+    Special     bool   `json:"special"`
+    UserDefined bool
+}
+
+// getMaxID returns the largest token ID found in either the base vocabulary
+// or the added tokens.
+func (t *Tokenizer) getMaxID() int {
+    var maxID int
+    for _, v := range t.Model.Vocab {
+        maxID = max(maxID, v)
+    }
+
+    for _, v := range t.AddedTokens {
+        maxID = max(maxID, v.ID)
+    }
+    return maxID
+}
+
+// newTokenizer reads the tokenizer JSON at the given path and builds a dense
+// Tokens slice indexed by token ID, marking added tokens as user defined.
+func newTokenizer(dirpath string) (*Tokenizer, error) {
+    f, err := os.Open(dirpath)
+    if err != nil {
+        return nil, err
+    }
+    defer f.Close()
+
+    data, err := io.ReadAll(f)
+    if err != nil {
+        return nil, err
+    }
+
+    var tdata Tokenizer
+
+    if err := json.Unmarshal(data, &tdata); err != nil {
+        return nil, err
+    }
+
+    maxID := tdata.getMaxID()
+    tdata.Model.Tokens = make([]Token, maxID+1)
+
+    for k, v := range tdata.Model.Vocab {
+        tdata.Model.Tokens[v] = Token{ID: v, Content: k, Special: false, UserDefined: false}
+    }
+
+    for _, v := range tdata.AddedTokens {
+        v.UserDefined = true
+        tdata.Model.Tokens[v.ID] = v
+    }
+
+    return &tdata, nil
+}
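
For reviewers: a minimal usage sketch, not part of the patch, showing how the new helper might be exercised from elsewhere in the convert package. The printTokens name, the modelDir parameter, and the tokenizer.json filename are assumptions for illustration only.

package convert

import (
    "fmt"
    "path/filepath"
)

// printTokens is a hypothetical helper (not in this patch): it loads the
// tokenizer.json next to a model and prints the dense token list that
// newTokenizer builds. Gaps in the ID space appear as zero-value entries.
func printTokens(modelDir string) error {
    t, err := newTokenizer(filepath.Join(modelDir, "tokenizer.json"))
    if err != nil {
        return err
    }
    for _, tok := range t.Model.Tokens {
        fmt.Println(tok.ID, tok.Content, tok.Special, tok.UserDefined)
    }
    return nil
}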