diff --git a/cmd/cmd.go b/cmd/cmd.go index a26f81bf..44481788 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1,6 +1,7 @@ package cmd import ( + "archive/zip" "bytes" "context" "crypto/ed25519" @@ -87,22 +88,82 @@ func CreateHandler(cmd *cobra.Command, args []string) error { path = filepath.Join(filepath.Dir(filename), path) } - bin, err := os.Open(path) + fi, err := os.Stat(path) if errors.Is(err, os.ErrNotExist) && c.Name == "model" { continue } else if err != nil { return err } - defer bin.Close() - hash := sha256.New() - if _, err := io.Copy(hash, bin); err != nil { - return err + // TODO make this work w/ adapters + if fi.IsDir() { + tf, err := os.CreateTemp("", "ollama-tf") + if err != nil { + return err + } + defer os.RemoveAll(tf.Name()) + + zf := zip.NewWriter(tf) + + files, err := filepath.Glob(filepath.Join(path, "model-*.safetensors")) + if err != nil { + return err + } + + if len(files) == 0 { + return fmt.Errorf("no safetensors files were found in '%s'", path) + } + + // add the safetensor config file + tokenizer + files = append(files, filepath.Join(path, "config.json")) + files = append(files, filepath.Join(path, "added_tokens.json")) + files = append(files, filepath.Join(path, "tokenizer.model")) + + for _, fn := range files { + f, err := os.Open(fn) + if os.IsNotExist(err) && strings.HasSuffix(fn, "added_tokens.json") { + continue + } else if err != nil { + return err + } + + fi, err := f.Stat() + if err != nil { + return err + } + + h, err := zip.FileInfoHeader(fi) + if err != nil { + return err + } + + h.Name = filepath.Base(fn) + h.Method = zip.Store + + w, err := zf.CreateHeader(h) + if err != nil { + return err + } + + _, err = io.Copy(w, f) + if err != nil { + return err + } + + } + + if err := zf.Close(); err != nil { + return err + } + + if err := tf.Close(); err != nil { + return err + } + path = tf.Name() } - bin.Seek(0, io.SeekStart) - digest := fmt.Sprintf("sha256:%x", hash.Sum(nil)) - if err = client.CreateBlob(cmd.Context(), digest, 
bin); err != nil { + digest, err := createBlob(cmd, client, path) + if err != nil { return err } @@ -141,6 +202,26 @@ func CreateHandler(cmd *cobra.Command, args []string) error { return nil } +func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, error) { + bin, err := os.Open(path) + if err != nil { + return "", err + } + defer bin.Close() + + hash := sha256.New() + if _, err := io.Copy(hash, bin); err != nil { + return "", err + } + bin.Seek(0, io.SeekStart) + + digest := fmt.Sprintf("sha256:%x", hash.Sum(nil)) + if err = client.CreateBlob(cmd.Context(), digest, bin); err != nil { + return "", err + } + return digest, nil +} + func RunHandler(cmd *cobra.Command, args []string) error { client, err := api.ClientFromEnvironment() if err != nil { diff --git a/convert/convert.go b/convert/convert.go new file mode 100644 index 00000000..11d7e63a --- /dev/null +++ b/convert/convert.go @@ -0,0 +1,331 @@ +package convert + +import ( + "bytes" + "cmp" + "encoding/binary" + "encoding/json" + "fmt" + "io" + "log/slog" + "os" + "path/filepath" + "regexp" + "slices" + + "github.com/mitchellh/mapstructure" + "google.golang.org/protobuf/proto" + + "github.com/jmorganca/ollama/convert/sentencepiece" + "github.com/jmorganca/ollama/llm" +) + +type Params struct { + Architectures []string `json:"architectures"` + VocabSize int `json:"vocab_size"` + HiddenSize int `json:"hidden_size"` // n_embd + HiddenLayers int `json:"num_hidden_layers"` // n_layer + ContextSize int `json:"max_position_embeddings"` + IntermediateSize int `json:"intermediate_size"` + AttentionHeads int `json:"num_attention_heads"` // n_head + KeyValHeads int `json:"num_key_value_heads"` + NormEPS float64 `json:"rms_norm_eps"` + RopeFreqBase float64 `json:"rope_theta"` + BoSTokenID int `json:"bos_token_id"` + EoSTokenID int `json:"eos_token_id"` +} + +type MetaData struct { + Type string `mapstructure:"dtype"` + Shape []int `mapstructure:"shape"` + Offsets []int `mapstructure:"data_offsets"` 
+} + +func ReadSafeTensors(fn string, offset uint64) ([]llm.Tensor, uint64, error) { + f, err := os.Open(fn) + if err != nil { + return []llm.Tensor{}, 0, err + } + defer f.Close() + + var jsonSize uint64 + binary.Read(f, binary.LittleEndian, &jsonSize) + + buf := make([]byte, jsonSize) + _, err = io.ReadFull(f, buf) + if err != nil { + return []llm.Tensor{}, 0, err + } + + d := json.NewDecoder(bytes.NewBuffer(buf)) + d.UseNumber() + var parsed map[string]interface{} + if err = d.Decode(&parsed); err != nil { + return []llm.Tensor{}, 0, err + } + + var keys []string + for k := range parsed { + keys = append(keys, k) + } + + slices.Sort(keys) + + slog.Info("converting layers") + + var tensors []llm.Tensor + for _, k := range keys { + vals := parsed[k].(map[string]interface{}) + var data MetaData + if err = mapstructure.Decode(vals, &data); err != nil { + return []llm.Tensor{}, 0, err + } + + var size uint64 + var kind uint32 + switch len(data.Shape) { + case 0: + // metadata + continue + case 1: + // convert to float32 + kind = 0 + size = uint64(data.Shape[0] * 4) + case 2: + // convert to float16 + kind = 1 + size = uint64(data.Shape[0] * data.Shape[1] * 2) + } + + ggufName, err := GetTensorName(k) + if err != nil { + slog.Error("%v", err) + return []llm.Tensor{}, 0, err + } + + shape := [4]uint64{0, 0, 0, 0} + for cnt, s := range data.Shape { + shape[cnt] = uint64(s) + } + + t := llm.Tensor{ + Name: ggufName, + Kind: kind, + Offset: offset, + Shape: shape, + FileName: fn, + OffsetPadding: 8 + jsonSize, + FileOffsets: []uint64{uint64(data.Offsets[0]), uint64(data.Offsets[1])}, + } + slog.Debug(fmt.Sprintf("%v", t)) + tensors = append(tensors, t) + offset += size + } + return tensors, offset, nil +} + +func GetSafeTensors(dirpath string) ([]llm.Tensor, error) { + var tensors []llm.Tensor + files, err := filepath.Glob(filepath.Join(dirpath, "/model-*.safetensors")) + if err != nil { + return []llm.Tensor{}, err + } + + var offset uint64 + for _, f := range files { + 
var t []llm.Tensor + var err error + t, offset, err = ReadSafeTensors(f, offset) + if err != nil { + slog.Error("%v", err) + return []llm.Tensor{}, err + } + tensors = append(tensors, t...) + } + return tensors, nil +} + +func GetParams(dirpath string) (*Params, error) { + f, err := os.Open(filepath.Join(dirpath, "config.json")) + if err != nil { + return nil, err + } + defer f.Close() + + var params Params + + d := json.NewDecoder(f) + err = d.Decode(¶ms) + if err != nil { + return nil, err + } + + return ¶ms, nil +} + +// Details on gguf's tokenizer can be found at: +// https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#tokenizer +type Vocab struct { + Tokens []string + Scores []float32 + Types []int32 +} + +func LoadTokens(dirpath string) (*Vocab, error) { + slog.Info(fmt.Sprintf("reading vocab from %s", filepath.Join(dirpath, "tokenizer.model"))) + in, err := os.ReadFile(filepath.Join(dirpath, "tokenizer.model")) + if err != nil { + return nil, err + } + + // To regenerate sentencepiece from the protobufs use: + // protoc -I=./ --go_out=./ sentencepiece_model.proto + modelProto := &sentencepiece.ModelProto{} + if err := proto.Unmarshal(in, modelProto); err != nil { + return nil, err + } + + v := &Vocab{ + Tokens: make([]string, 0), + Scores: make([]float32, 0), + Types: make([]int32, 0), + } + + pieces := modelProto.GetPieces() + for _, p := range pieces { + v.Tokens = append(v.Tokens, p.GetPiece()) + v.Scores = append(v.Scores, p.GetScore()) + t := p.GetType() + v.Types = append(v.Types, int32(t)) + } + + slog.Info(fmt.Sprintf("vocab size: %d", len(v.Tokens))) + + // add any additional tokens + addIn, err := os.ReadFile(filepath.Join(dirpath, "added_tokens.json")) + if os.IsNotExist(err) { + return v, nil + } else if err != nil { + return nil, err + } + + slog.Info("reading user defined tokens") + + var extraTokenData map[string]int + if err := json.Unmarshal(addIn, &extraTokenData); err != nil { + return nil, err + } + + type token struct { + key 
string + pos int + } + + extraTokens := make([]token, 0) + for k, id := range extraTokenData { + extraTokens = append(extraTokens, token{k, id}) + } + + slices.SortFunc(extraTokens, func(a, b token) int { + return cmp.Compare(a.pos, b.pos) + }) + + numToks := len(v.Tokens) + + for cnt, t := range extraTokens { + // the token id should match the specific index for the total number of tokens + if t.pos != cnt+numToks { + return nil, fmt.Errorf("token ID '%d' for '%s' doesn't match total token size", t.pos, t.key) + } + v.Tokens = append(v.Tokens, t.key) + v.Scores = append(v.Scores, -1000.0) + v.Types = append(v.Types, int32(llm.GGUFTokenUserDefined)) + } + slog.Info(fmt.Sprintf("vocab size w/ extra tokens: %d", len(v.Tokens))) + + return v, nil +} + +func GetTensorName(n string) (string, error) { + tMap := map[string]string{ + "model.embed_tokens.weight": "token_embd.weight", + "model.layers.(\\d+).input_layernorm.weight": "blk.$1.attn_norm.weight", + "model.layers.(\\d+).mlp.down_proj.weight": "blk.$1.ffn_down.weight", + "model.layers.(\\d+).mlp.gate_proj.weight": "blk.$1.ffn_gate.weight", + "model.layers.(\\d+).mlp.up_proj.weight": "blk.$1.ffn_up.weight", + "model.layers.(\\d+).post_attention_layernorm.weight": "blk.$1.ffn_norm.weight", + "model.layers.(\\d+).self_attn.k_proj.weight": "blk.$1.attn_k.weight", + "model.layers.(\\d+).self_attn.o_proj.weight": "blk.$1.attn_output.weight", + "model.layers.(\\d+).self_attn.q_proj.weight": "blk.$1.attn_q.weight", + "model.layers.(\\d+).self_attn.v_proj.weight": "blk.$1.attn_v.weight", + "lm_head.weight": "output.weight", + "model.norm.weight": "output_norm.weight", + } + + v, ok := tMap[n] + if ok { + return v, nil + } + + // quick hack to rename the layers to gguf format + for k, v := range tMap { + re := regexp.MustCompile(k) + newName := re.ReplaceAllString(n, v) + if newName != n { + return newName, nil + } + } + + return "", fmt.Errorf("couldn't find a layer name for '%s'", n) +} + +func WriteGGUF(name string, 
tensors []llm.Tensor, params *Params, vocab *Vocab) (string, error) { + c := llm.ContainerGGUF{ + ByteOrder: binary.LittleEndian, + } + + m := llm.NewGGUFModel(&c) + m.Tensors = tensors + m.KV["general.architecture"] = "llama" + m.KV["general.name"] = name + m.KV["llama.context_length"] = uint32(params.ContextSize) + m.KV["llama.embedding_length"] = uint32(params.HiddenSize) + m.KV["llama.block_count"] = uint32(params.HiddenLayers) + m.KV["llama.feed_forward_length"] = uint32(params.IntermediateSize) + m.KV["llama.rope.dimension_count"] = uint32(128) + m.KV["llama.attention.head_count"] = uint32(params.AttentionHeads) + m.KV["llama.attention.head_count_kv"] = uint32(params.KeyValHeads) + m.KV["llama.attention.layer_norm_rms_epsilon"] = float32(params.NormEPS) + m.KV["llama.rope.freq_base"] = float32(params.RopeFreqBase) + m.KV["general.file_type"] = uint32(1) + m.KV["tokenizer.ggml.model"] = "llama" + + m.KV["tokenizer.ggml.tokens"] = vocab.Tokens + m.KV["tokenizer.ggml.scores"] = vocab.Scores + m.KV["tokenizer.ggml.token_type"] = vocab.Types + + m.KV["tokenizer.ggml.bos_token_id"] = uint32(params.BoSTokenID) + m.KV["tokenizer.ggml.eos_token_id"] = uint32(params.EoSTokenID) + m.KV["tokenizer.ggml.unknown_token_id"] = uint32(0) + m.KV["tokenizer.ggml.add_bos_token"] = true + m.KV["tokenizer.ggml.add_eos_token"] = false + + // llamacpp sets the chat template, however we don't need to set it since we pass it in through a layer + // m.KV["tokenizer.chat_template"] = "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" // XXX removeme + + c.V3.NumTensor = 
uint64(len(tensors)) + c.V3.NumKV = uint64(len(m.KV)) + + f, err := os.CreateTemp("", "ollama-gguf") + if err != nil { + return "", err + } + defer f.Close() + + err = m.Encode(f) + if err != nil { + return "", err + } + + return f.Name(), nil +} diff --git a/convert/sentencepiece/sentencepiece_model.pb.go b/convert/sentencepiece/sentencepiece_model.pb.go new file mode 100644 index 00000000..5c8db9bc --- /dev/null +++ b/convert/sentencepiece/sentencepiece_model.pb.go @@ -0,0 +1,1497 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.! + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.32.0 +// protoc v4.25.2 +// source: sentencepiece_model.proto + +package sentencepiece + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// Model type. only have UNIGRAM now. 
+type TrainerSpec_ModelType int32 + +const ( + TrainerSpec_UNIGRAM TrainerSpec_ModelType = 1 // Unigram language model with dynamic algorithm + TrainerSpec_BPE TrainerSpec_ModelType = 2 // Byte Pair Encoding + TrainerSpec_WORD TrainerSpec_ModelType = 3 // Delimitered by whitespace. + TrainerSpec_CHAR TrainerSpec_ModelType = 4 // tokenizes into character sequence +) + +// Enum value maps for TrainerSpec_ModelType. +var ( + TrainerSpec_ModelType_name = map[int32]string{ + 1: "UNIGRAM", + 2: "BPE", + 3: "WORD", + 4: "CHAR", + } + TrainerSpec_ModelType_value = map[string]int32{ + "UNIGRAM": 1, + "BPE": 2, + "WORD": 3, + "CHAR": 4, + } +) + +func (x TrainerSpec_ModelType) Enum() *TrainerSpec_ModelType { + p := new(TrainerSpec_ModelType) + *p = x + return p +} + +func (x TrainerSpec_ModelType) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (TrainerSpec_ModelType) Descriptor() protoreflect.EnumDescriptor { + return file_sentencepiece_model_proto_enumTypes[0].Descriptor() +} + +func (TrainerSpec_ModelType) Type() protoreflect.EnumType { + return &file_sentencepiece_model_proto_enumTypes[0] +} + +func (x TrainerSpec_ModelType) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Do not use. +func (x *TrainerSpec_ModelType) UnmarshalJSON(b []byte) error { + num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) + if err != nil { + return err + } + *x = TrainerSpec_ModelType(num) + return nil +} + +// Deprecated: Use TrainerSpec_ModelType.Descriptor instead. +func (TrainerSpec_ModelType) EnumDescriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{0, 0} +} + +type ModelProto_SentencePiece_Type int32 + +const ( + ModelProto_SentencePiece_NORMAL ModelProto_SentencePiece_Type = 1 // normal symbol + ModelProto_SentencePiece_UNKNOWN ModelProto_SentencePiece_Type = 2 // unknown symbol. only for now. 
+ ModelProto_SentencePiece_CONTROL ModelProto_SentencePiece_Type = 3 // control symbols. , , <2ja> etc. + ModelProto_SentencePiece_USER_DEFINED ModelProto_SentencePiece_Type = 4 // user defined symbols. + // Typical usage of USER_DEFINED symbol + // is placeholder. + ModelProto_SentencePiece_BYTE ModelProto_SentencePiece_Type = 6 // byte symbols. Used when `byte_fallback` is true. + ModelProto_SentencePiece_UNUSED ModelProto_SentencePiece_Type = 5 // this piece is not used. +) + +// Enum value maps for ModelProto_SentencePiece_Type. +var ( + ModelProto_SentencePiece_Type_name = map[int32]string{ + 1: "NORMAL", + 2: "UNKNOWN", + 3: "CONTROL", + 4: "USER_DEFINED", + 6: "BYTE", + 5: "UNUSED", + } + ModelProto_SentencePiece_Type_value = map[string]int32{ + "NORMAL": 1, + "UNKNOWN": 2, + "CONTROL": 3, + "USER_DEFINED": 4, + "BYTE": 6, + "UNUSED": 5, + } +) + +func (x ModelProto_SentencePiece_Type) Enum() *ModelProto_SentencePiece_Type { + p := new(ModelProto_SentencePiece_Type) + *p = x + return p +} + +func (x ModelProto_SentencePiece_Type) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (ModelProto_SentencePiece_Type) Descriptor() protoreflect.EnumDescriptor { + return file_sentencepiece_model_proto_enumTypes[1].Descriptor() +} + +func (ModelProto_SentencePiece_Type) Type() protoreflect.EnumType { + return &file_sentencepiece_model_proto_enumTypes[1] +} + +func (x ModelProto_SentencePiece_Type) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Do not use. +func (x *ModelProto_SentencePiece_Type) UnmarshalJSON(b []byte) error { + num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) + if err != nil { + return err + } + *x = ModelProto_SentencePiece_Type(num) + return nil +} + +// Deprecated: Use ModelProto_SentencePiece_Type.Descriptor instead. 
+func (ModelProto_SentencePiece_Type) EnumDescriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{3, 0, 0} +} + +// TrainerSpec encodes a various parameters for SentencePiece training. +// Next id: 55 +type TrainerSpec struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + // ///////////////////////////////////////////////////////////////// + // General parameters + // + // Input corpus files. + // + // Trainer accepts the following two formats: + // A) Monolingual: plain text, one sentence per line. + // B) Bilingual: TSV, source sentence target sentence + // When bilingual data is passed, shared vocabulary model is built. + // Note that the input file must be raw corpus, not a preprocessed corpus. + // Trainer only loads the first `input_sentence_size` sentences specified + // with this parameter. + Input []string `protobuf:"bytes,1,rep,name=input" json:"input,omitempty"` + // Input corpus format: + // "text": one-sentence-per-line text format (default) + // "tsv": sentence freq + InputFormat *string `protobuf:"bytes,7,opt,name=input_format,json=inputFormat" json:"input_format,omitempty"` + // Output model file prefix. + // .model and .vocab are generated. + ModelPrefix *string `protobuf:"bytes,2,opt,name=model_prefix,json=modelPrefix" json:"model_prefix,omitempty"` + ModelType *TrainerSpec_ModelType `protobuf:"varint,3,opt,name=model_type,json=modelType,enum=sentencepiece.TrainerSpec_ModelType,def=1" json:"model_type,omitempty"` + // Vocabulary size. 8k is the default size. + VocabSize *int32 `protobuf:"varint,4,opt,name=vocab_size,json=vocabSize,def=8000" json:"vocab_size,omitempty"` + // List of the languages this model can accept. + // Since the model is language-agnostic, this field is used as a reference. 
+ AcceptLanguage []string `protobuf:"bytes,5,rep,name=accept_language,json=acceptLanguage" json:"accept_language,omitempty"` + // Size of self-test samples, which are encoded in the model file. + SelfTestSampleSize *int32 `protobuf:"varint,6,opt,name=self_test_sample_size,json=selfTestSampleSize,def=0" json:"self_test_sample_size,omitempty"` + // Whether to use DP version of sentencepiece. Use it with TSV input format + // (requires precomputed word tab counts to work). + EnableDifferentialPrivacy *bool `protobuf:"varint,50,opt,name=enable_differential_privacy,json=enableDifferentialPrivacy,def=0" json:"enable_differential_privacy,omitempty"` + // Set these parameters if you need DP version of sentencepiece. + // std of noise to add. + DifferentialPrivacyNoiseLevel *float32 `protobuf:"fixed32,51,opt,name=differential_privacy_noise_level,json=differentialPrivacyNoiseLevel,def=0" json:"differential_privacy_noise_level,omitempty"` + // Clipping threshold to apply after adding noise. All the words with + // frequency less than this value are dropped. + DifferentialPrivacyClippingThreshold *uint64 `protobuf:"varint,52,opt,name=differential_privacy_clipping_threshold,json=differentialPrivacyClippingThreshold,def=0" json:"differential_privacy_clipping_threshold,omitempty"` + // ///////////////////////////////////////////////////////////////// + // Training parameters. + // + // Uses characters which cover the corpus with the ratio of `chars_coverage`. + // This parameter determines the set of basic Alphabet of sentence piece. + // 1.0 - `chars_coverage` characters are treated as UNK. + // See also required_chars field. + CharacterCoverage *float32 `protobuf:"fixed32,10,opt,name=character_coverage,json=characterCoverage,def=0.9995" json:"character_coverage,omitempty"` + // Maximum size of sentences the trainer loads from `input` parameter. + // Trainer simply loads the `input` files in sequence. + // It is better to shuffle the input corpus randomly. 
+ InputSentenceSize *uint64 `protobuf:"varint,11,opt,name=input_sentence_size,json=inputSentenceSize,def=0" json:"input_sentence_size,omitempty"` + ShuffleInputSentence *bool `protobuf:"varint,19,opt,name=shuffle_input_sentence,json=shuffleInputSentence,def=1" json:"shuffle_input_sentence,omitempty"` + // Maximum size of sentences to make seed sentence pieces. + // Extended suffix array is constructed to extract frequent + // sub-strings from the corpus. This uses 20N working space, + // where N is the size of corpus. + // + // Deprecated: Marked as deprecated in sentencepiece_model.proto. + MiningSentenceSize *int32 `protobuf:"varint,12,opt,name=mining_sentence_size,json=miningSentenceSize" json:"mining_sentence_size,omitempty"` + // Maximum size of sentences to train sentence pieces. + // + // Deprecated: Marked as deprecated in sentencepiece_model.proto. + TrainingSentenceSize *int32 `protobuf:"varint,13,opt,name=training_sentence_size,json=trainingSentenceSize" json:"training_sentence_size,omitempty"` + // The size of seed sentencepieces. + // `seed_sentencepiece_size` must be larger than `vocab_size`. + SeedSentencepieceSize *int32 `protobuf:"varint,14,opt,name=seed_sentencepiece_size,json=seedSentencepieceSize,def=1000000" json:"seed_sentencepiece_size,omitempty"` + // In every EM sub-iterations, keeps top + // `shrinking_factor` * `current sentencepieces size` with respect to + // the loss of the sentence piece. This value should be smaller than 1.0. + ShrinkingFactor *float32 `protobuf:"fixed32,15,opt,name=shrinking_factor,json=shrinkingFactor,def=0.75" json:"shrinking_factor,omitempty"` + // The maximum sentence length in byte. The sentences with the length + // larger than `max_sentence_length` is simply ignored. + // Longer input tends to bring the following risks: + // - Overflow during EM training (unigram language model only) + // - Performance drop because of O(n log n) cost in BPE. 
+ MaxSentenceLength *int32 `protobuf:"varint,18,opt,name=max_sentence_length,json=maxSentenceLength,def=4192" json:"max_sentence_length,omitempty"` + // Number of threads in the training. + NumThreads *int32 `protobuf:"varint,16,opt,name=num_threads,json=numThreads,def=16" json:"num_threads,omitempty"` + // Number of EM sub iterations. + NumSubIterations *int32 `protobuf:"varint,17,opt,name=num_sub_iterations,json=numSubIterations,def=2" json:"num_sub_iterations,omitempty"` + // ///////////////////////////////////////////////////////////////// + // SentencePiece parameters which control the shapes of sentence piece. + // + // Maximum length of sentencepiece. + MaxSentencepieceLength *int32 `protobuf:"varint,20,opt,name=max_sentencepiece_length,json=maxSentencepieceLength,def=16" json:"max_sentencepiece_length,omitempty"` + // Uses Unicode script to split sentence pieces. + // When `split_by_unicode_script` is true, we do not allow sentence piece to + // include multiple Unicode scripts, e.g. "F1" is not a valid piece. + // Exception: CJ characters (Hiragana/Katakana/Han) are all handled + // as one script type, since Japanese word can consist of multiple scripts. + // This exception is always applied regardless of the accept-language + // parameter. + SplitByUnicodeScript *bool `protobuf:"varint,21,opt,name=split_by_unicode_script,json=splitByUnicodeScript,def=1" json:"split_by_unicode_script,omitempty"` + // When `split_by_number` is true, put a boundary between number and + // non-number transition. If we want to treat "F1" is one token, set this flag + // to be false. + SplitByNumber *bool `protobuf:"varint,23,opt,name=split_by_number,json=splitByNumber,def=1" json:"split_by_number,omitempty"` + // Use a white space to split sentence pieces. + // When `split_by_whitespace` is false, we may have the piece containing + // a white space in the middle. e.g., "in_the". 
+ SplitByWhitespace *bool `protobuf:"varint,22,opt,name=split_by_whitespace,json=splitByWhitespace,def=1" json:"split_by_whitespace,omitempty"` + // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello => + // hello_. When `treat_whitespace_as_suffix` is true, + // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end + // of sentence. + TreatWhitespaceAsSuffix *bool `protobuf:"varint,24,opt,name=treat_whitespace_as_suffix,json=treatWhitespaceAsSuffix,def=0" json:"treat_whitespace_as_suffix,omitempty"` + // Allows pieces that only contain whitespaces instead of appearing only as + // prefix or suffix of other pieces. + AllowWhitespaceOnlyPieces *bool `protobuf:"varint,26,opt,name=allow_whitespace_only_pieces,json=allowWhitespaceOnlyPieces,def=0" json:"allow_whitespace_only_pieces,omitempty"` + // Split all digits (0-9) into separate pieces. + SplitDigits *bool `protobuf:"varint,25,opt,name=split_digits,json=splitDigits,def=0" json:"split_digits,omitempty"` + // Defines the pre-tokenization delimiter. + // When specified, no pieces crossing this delimiter is not included + // in the vocab. Then the delimiter string is virtually ignored + // during the training. This field can allows constraints on the vocabulary + // selection. Note that this field is available on unigram mode. + PretokenizationDelimiter *string `protobuf:"bytes,53,opt,name=pretokenization_delimiter,json=pretokenizationDelimiter,def=" json:"pretokenization_delimiter,omitempty"` + // ///////////////////////////////////////////////////////////////// + // Vocabulary management + // + // Defines control symbols used as an indicator to + // change the behavior of the decoder. and are pre-defined. + // We can use this field to encode various meta information, + // including language indicator in multilingual model. + // These symbols are not visible to users, but visible to + // the decoder. 
Note that when the input sentence contains control symbols, + // they are not treated as one token, but segmented into normal pieces. + // Control symbols must be inserted independently from the segmentation. + ControlSymbols []string `protobuf:"bytes,30,rep,name=control_symbols,json=controlSymbols" json:"control_symbols,omitempty"` + // Defines user defined symbols. + // These symbols are added with extremely high score + // so they are always treated as one unique symbol in any context. + // Typical usage of user_defined_symbols is placeholder for named entities. + UserDefinedSymbols []string `protobuf:"bytes,31,rep,name=user_defined_symbols,json=userDefinedSymbols" json:"user_defined_symbols,omitempty"` + // Defines required characters. Each UTF8 character in this string is included + // in the character set regardless of character_coverage value. Unlike + // user_defined_symbols, these characters have scores based on the frequency + // on input sentences, and the model can form subwords using characters + // in this field. + RequiredChars *string `protobuf:"bytes,36,opt,name=required_chars,json=requiredChars" json:"required_chars,omitempty"` + // Decomposes unknown pieces into UTF-8 bytes. + ByteFallback *bool `protobuf:"varint,35,opt,name=byte_fallback,json=byteFallback,def=0" json:"byte_fallback,omitempty"` + // When creating the vocabulary file, defines whether or not to additionally + // output the score for each piece. + VocabularyOutputPieceScore *bool `protobuf:"varint,32,opt,name=vocabulary_output_piece_score,json=vocabularyOutputPieceScore,def=1" json:"vocabulary_output_piece_score,omitempty"` + // `vocab_size` is treated as hard limit. Crash if + // the model can not produce the vocab of size `vocab_size`, + // When `hard_vocab_limit` is false, vocab_size is treated + // as soft limit. Note that when model_type=char, + // always assumes hard_vocab_limit = false. 
+ HardVocabLimit *bool `protobuf:"varint,33,opt,name=hard_vocab_limit,json=hardVocabLimit,def=1" json:"hard_vocab_limit,omitempty"` + // use all symbols for vocab extraction. This flag is valid + // if model type is either CHAR or WORD + UseAllVocab *bool `protobuf:"varint,34,opt,name=use_all_vocab,json=useAllVocab,def=0" json:"use_all_vocab,omitempty"` + // ///////////////////////////////////////////////////////////////// + // Reserved special meta tokens. + // * -1 is not used. + // * unk_id must not be -1. + // Id must starts with 0 and be contigous. + UnkId *int32 `protobuf:"varint,40,opt,name=unk_id,json=unkId,def=0" json:"unk_id,omitempty"` // + BosId *int32 `protobuf:"varint,41,opt,name=bos_id,json=bosId,def=1" json:"bos_id,omitempty"` // + EosId *int32 `protobuf:"varint,42,opt,name=eos_id,json=eosId,def=2" json:"eos_id,omitempty"` // + PadId *int32 `protobuf:"varint,43,opt,name=pad_id,json=padId,def=-1" json:"pad_id,omitempty"` // (padding) + UnkPiece *string `protobuf:"bytes,45,opt,name=unk_piece,json=unkPiece,def=" json:"unk_piece,omitempty"` + BosPiece *string `protobuf:"bytes,46,opt,name=bos_piece,json=bosPiece,def=" json:"bos_piece,omitempty"` + EosPiece *string `protobuf:"bytes,47,opt,name=eos_piece,json=eosPiece,def=" json:"eos_piece,omitempty"` + PadPiece *string `protobuf:"bytes,48,opt,name=pad_piece,json=padPiece,def=" json:"pad_piece,omitempty"` + // Encodes into U+2047 (DOUBLE QUESTION MARK), + // since this character can be useful both for user and + // developer. We can easily figure out that is emitted. + UnkSurface *string `protobuf:"bytes,44,opt,name=unk_surface,json=unkSurface,def= ⁇ " json:"unk_surface,omitempty"` + // Increase bit depth to allow unigram model training on large + // (>10M sentences) corpora. A Side-effect of enabling this flag + // is increased memory usage. 
+ TrainExtremelyLargeCorpus *bool `protobuf:"varint,49,opt,name=train_extremely_large_corpus,json=trainExtremelyLargeCorpus,def=0" json:"train_extremely_large_corpus,omitempty"` + // Path to a seed sentencepieces file, with one tab-separated + // seed sentencepiece frequency per line. + SeedSentencepiecesFile *string `protobuf:"bytes,54,opt,name=seed_sentencepieces_file,json=seedSentencepiecesFile,def=" json:"seed_sentencepieces_file,omitempty"` +} + +// Default values for TrainerSpec fields. +const ( + Default_TrainerSpec_ModelType = TrainerSpec_UNIGRAM + Default_TrainerSpec_VocabSize = int32(8000) + Default_TrainerSpec_SelfTestSampleSize = int32(0) + Default_TrainerSpec_EnableDifferentialPrivacy = bool(false) + Default_TrainerSpec_DifferentialPrivacyNoiseLevel = float32(0) + Default_TrainerSpec_DifferentialPrivacyClippingThreshold = uint64(0) + Default_TrainerSpec_CharacterCoverage = float32(0.9994999766349792) + Default_TrainerSpec_InputSentenceSize = uint64(0) + Default_TrainerSpec_ShuffleInputSentence = bool(true) + Default_TrainerSpec_SeedSentencepieceSize = int32(1000000) + Default_TrainerSpec_ShrinkingFactor = float32(0.75) + Default_TrainerSpec_MaxSentenceLength = int32(4192) + Default_TrainerSpec_NumThreads = int32(16) + Default_TrainerSpec_NumSubIterations = int32(2) + Default_TrainerSpec_MaxSentencepieceLength = int32(16) + Default_TrainerSpec_SplitByUnicodeScript = bool(true) + Default_TrainerSpec_SplitByNumber = bool(true) + Default_TrainerSpec_SplitByWhitespace = bool(true) + Default_TrainerSpec_TreatWhitespaceAsSuffix = bool(false) + Default_TrainerSpec_AllowWhitespaceOnlyPieces = bool(false) + Default_TrainerSpec_SplitDigits = bool(false) + Default_TrainerSpec_PretokenizationDelimiter = string("") + Default_TrainerSpec_ByteFallback = bool(false) + Default_TrainerSpec_VocabularyOutputPieceScore = bool(true) + Default_TrainerSpec_HardVocabLimit = bool(true) + Default_TrainerSpec_UseAllVocab = bool(false) + Default_TrainerSpec_UnkId = int32(0) + 
Default_TrainerSpec_BosId = int32(1) + Default_TrainerSpec_EosId = int32(2) + Default_TrainerSpec_PadId = int32(-1) + Default_TrainerSpec_UnkPiece = string("") + Default_TrainerSpec_BosPiece = string("") + Default_TrainerSpec_EosPiece = string("") + Default_TrainerSpec_PadPiece = string("") + Default_TrainerSpec_UnkSurface = string(" ⁇ ") + Default_TrainerSpec_TrainExtremelyLargeCorpus = bool(false) + Default_TrainerSpec_SeedSentencepiecesFile = string("") +) + +func (x *TrainerSpec) Reset() { + *x = TrainerSpec{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *TrainerSpec) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*TrainerSpec) ProtoMessage() {} + +func (x *TrainerSpec) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use TrainerSpec.ProtoReflect.Descriptor instead. 
+func (*TrainerSpec) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{0} +} + +func (x *TrainerSpec) GetInput() []string { + if x != nil { + return x.Input + } + return nil +} + +func (x *TrainerSpec) GetInputFormat() string { + if x != nil && x.InputFormat != nil { + return *x.InputFormat + } + return "" +} + +func (x *TrainerSpec) GetModelPrefix() string { + if x != nil && x.ModelPrefix != nil { + return *x.ModelPrefix + } + return "" +} + +func (x *TrainerSpec) GetModelType() TrainerSpec_ModelType { + if x != nil && x.ModelType != nil { + return *x.ModelType + } + return Default_TrainerSpec_ModelType +} + +func (x *TrainerSpec) GetVocabSize() int32 { + if x != nil && x.VocabSize != nil { + return *x.VocabSize + } + return Default_TrainerSpec_VocabSize +} + +func (x *TrainerSpec) GetAcceptLanguage() []string { + if x != nil { + return x.AcceptLanguage + } + return nil +} + +func (x *TrainerSpec) GetSelfTestSampleSize() int32 { + if x != nil && x.SelfTestSampleSize != nil { + return *x.SelfTestSampleSize + } + return Default_TrainerSpec_SelfTestSampleSize +} + +func (x *TrainerSpec) GetEnableDifferentialPrivacy() bool { + if x != nil && x.EnableDifferentialPrivacy != nil { + return *x.EnableDifferentialPrivacy + } + return Default_TrainerSpec_EnableDifferentialPrivacy +} + +func (x *TrainerSpec) GetDifferentialPrivacyNoiseLevel() float32 { + if x != nil && x.DifferentialPrivacyNoiseLevel != nil { + return *x.DifferentialPrivacyNoiseLevel + } + return Default_TrainerSpec_DifferentialPrivacyNoiseLevel +} + +func (x *TrainerSpec) GetDifferentialPrivacyClippingThreshold() uint64 { + if x != nil && x.DifferentialPrivacyClippingThreshold != nil { + return *x.DifferentialPrivacyClippingThreshold + } + return Default_TrainerSpec_DifferentialPrivacyClippingThreshold +} + +func (x *TrainerSpec) GetCharacterCoverage() float32 { + if x != nil && x.CharacterCoverage != nil { + return *x.CharacterCoverage + } + return 
Default_TrainerSpec_CharacterCoverage +} + +func (x *TrainerSpec) GetInputSentenceSize() uint64 { + if x != nil && x.InputSentenceSize != nil { + return *x.InputSentenceSize + } + return Default_TrainerSpec_InputSentenceSize +} + +func (x *TrainerSpec) GetShuffleInputSentence() bool { + if x != nil && x.ShuffleInputSentence != nil { + return *x.ShuffleInputSentence + } + return Default_TrainerSpec_ShuffleInputSentence +} + +// Deprecated: Marked as deprecated in sentencepiece_model.proto. +func (x *TrainerSpec) GetMiningSentenceSize() int32 { + if x != nil && x.MiningSentenceSize != nil { + return *x.MiningSentenceSize + } + return 0 +} + +// Deprecated: Marked as deprecated in sentencepiece_model.proto. +func (x *TrainerSpec) GetTrainingSentenceSize() int32 { + if x != nil && x.TrainingSentenceSize != nil { + return *x.TrainingSentenceSize + } + return 0 +} + +func (x *TrainerSpec) GetSeedSentencepieceSize() int32 { + if x != nil && x.SeedSentencepieceSize != nil { + return *x.SeedSentencepieceSize + } + return Default_TrainerSpec_SeedSentencepieceSize +} + +func (x *TrainerSpec) GetShrinkingFactor() float32 { + if x != nil && x.ShrinkingFactor != nil { + return *x.ShrinkingFactor + } + return Default_TrainerSpec_ShrinkingFactor +} + +func (x *TrainerSpec) GetMaxSentenceLength() int32 { + if x != nil && x.MaxSentenceLength != nil { + return *x.MaxSentenceLength + } + return Default_TrainerSpec_MaxSentenceLength +} + +func (x *TrainerSpec) GetNumThreads() int32 { + if x != nil && x.NumThreads != nil { + return *x.NumThreads + } + return Default_TrainerSpec_NumThreads +} + +func (x *TrainerSpec) GetNumSubIterations() int32 { + if x != nil && x.NumSubIterations != nil { + return *x.NumSubIterations + } + return Default_TrainerSpec_NumSubIterations +} + +func (x *TrainerSpec) GetMaxSentencepieceLength() int32 { + if x != nil && x.MaxSentencepieceLength != nil { + return *x.MaxSentencepieceLength + } + return Default_TrainerSpec_MaxSentencepieceLength +} + +func (x 
*TrainerSpec) GetSplitByUnicodeScript() bool { + if x != nil && x.SplitByUnicodeScript != nil { + return *x.SplitByUnicodeScript + } + return Default_TrainerSpec_SplitByUnicodeScript +} + +func (x *TrainerSpec) GetSplitByNumber() bool { + if x != nil && x.SplitByNumber != nil { + return *x.SplitByNumber + } + return Default_TrainerSpec_SplitByNumber +} + +func (x *TrainerSpec) GetSplitByWhitespace() bool { + if x != nil && x.SplitByWhitespace != nil { + return *x.SplitByWhitespace + } + return Default_TrainerSpec_SplitByWhitespace +} + +func (x *TrainerSpec) GetTreatWhitespaceAsSuffix() bool { + if x != nil && x.TreatWhitespaceAsSuffix != nil { + return *x.TreatWhitespaceAsSuffix + } + return Default_TrainerSpec_TreatWhitespaceAsSuffix +} + +func (x *TrainerSpec) GetAllowWhitespaceOnlyPieces() bool { + if x != nil && x.AllowWhitespaceOnlyPieces != nil { + return *x.AllowWhitespaceOnlyPieces + } + return Default_TrainerSpec_AllowWhitespaceOnlyPieces +} + +func (x *TrainerSpec) GetSplitDigits() bool { + if x != nil && x.SplitDigits != nil { + return *x.SplitDigits + } + return Default_TrainerSpec_SplitDigits +} + +func (x *TrainerSpec) GetPretokenizationDelimiter() string { + if x != nil && x.PretokenizationDelimiter != nil { + return *x.PretokenizationDelimiter + } + return Default_TrainerSpec_PretokenizationDelimiter +} + +func (x *TrainerSpec) GetControlSymbols() []string { + if x != nil { + return x.ControlSymbols + } + return nil +} + +func (x *TrainerSpec) GetUserDefinedSymbols() []string { + if x != nil { + return x.UserDefinedSymbols + } + return nil +} + +func (x *TrainerSpec) GetRequiredChars() string { + if x != nil && x.RequiredChars != nil { + return *x.RequiredChars + } + return "" +} + +func (x *TrainerSpec) GetByteFallback() bool { + if x != nil && x.ByteFallback != nil { + return *x.ByteFallback + } + return Default_TrainerSpec_ByteFallback +} + +func (x *TrainerSpec) GetVocabularyOutputPieceScore() bool { + if x != nil && 
x.VocabularyOutputPieceScore != nil { + return *x.VocabularyOutputPieceScore + } + return Default_TrainerSpec_VocabularyOutputPieceScore +} + +func (x *TrainerSpec) GetHardVocabLimit() bool { + if x != nil && x.HardVocabLimit != nil { + return *x.HardVocabLimit + } + return Default_TrainerSpec_HardVocabLimit +} + +func (x *TrainerSpec) GetUseAllVocab() bool { + if x != nil && x.UseAllVocab != nil { + return *x.UseAllVocab + } + return Default_TrainerSpec_UseAllVocab +} + +func (x *TrainerSpec) GetUnkId() int32 { + if x != nil && x.UnkId != nil { + return *x.UnkId + } + return Default_TrainerSpec_UnkId +} + +func (x *TrainerSpec) GetBosId() int32 { + if x != nil && x.BosId != nil { + return *x.BosId + } + return Default_TrainerSpec_BosId +} + +func (x *TrainerSpec) GetEosId() int32 { + if x != nil && x.EosId != nil { + return *x.EosId + } + return Default_TrainerSpec_EosId +} + +func (x *TrainerSpec) GetPadId() int32 { + if x != nil && x.PadId != nil { + return *x.PadId + } + return Default_TrainerSpec_PadId +} + +func (x *TrainerSpec) GetUnkPiece() string { + if x != nil && x.UnkPiece != nil { + return *x.UnkPiece + } + return Default_TrainerSpec_UnkPiece +} + +func (x *TrainerSpec) GetBosPiece() string { + if x != nil && x.BosPiece != nil { + return *x.BosPiece + } + return Default_TrainerSpec_BosPiece +} + +func (x *TrainerSpec) GetEosPiece() string { + if x != nil && x.EosPiece != nil { + return *x.EosPiece + } + return Default_TrainerSpec_EosPiece +} + +func (x *TrainerSpec) GetPadPiece() string { + if x != nil && x.PadPiece != nil { + return *x.PadPiece + } + return Default_TrainerSpec_PadPiece +} + +func (x *TrainerSpec) GetUnkSurface() string { + if x != nil && x.UnkSurface != nil { + return *x.UnkSurface + } + return Default_TrainerSpec_UnkSurface +} + +func (x *TrainerSpec) GetTrainExtremelyLargeCorpus() bool { + if x != nil && x.TrainExtremelyLargeCorpus != nil { + return *x.TrainExtremelyLargeCorpus + } + return 
Default_TrainerSpec_TrainExtremelyLargeCorpus +} + +func (x *TrainerSpec) GetSeedSentencepiecesFile() string { + if x != nil && x.SeedSentencepiecesFile != nil { + return *x.SeedSentencepiecesFile + } + return Default_TrainerSpec_SeedSentencepiecesFile +} + +// NormalizerSpec encodes a various parameters for string normalizaiton +type NormalizerSpec struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + // name of normalization rule. + Name *string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"` + // Pre-compiled normalization rule created by + // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method. + // Usually this field is set by Builder::GetNormalizerSpec() method. + PrecompiledCharsmap []byte `protobuf:"bytes,2,opt,name=precompiled_charsmap,json=precompiledCharsmap" json:"precompiled_charsmap,omitempty"` + // Adds dummy whitespace at the beginning of text in order to + // treat "world" in "world" and "hello world" in the same way. + AddDummyPrefix *bool `protobuf:"varint,3,opt,name=add_dummy_prefix,json=addDummyPrefix,def=1" json:"add_dummy_prefix,omitempty"` + // Removes leading, trailing, and duplicate internal whitespace. + RemoveExtraWhitespaces *bool `protobuf:"varint,4,opt,name=remove_extra_whitespaces,json=removeExtraWhitespaces,def=1" json:"remove_extra_whitespaces,omitempty"` + // Replaces whitespace with meta symbol. + // This field must be true to train sentence piece model. + EscapeWhitespaces *bool `protobuf:"varint,5,opt,name=escape_whitespaces,json=escapeWhitespaces,def=1" json:"escape_whitespaces,omitempty"` + // Custom normalization rule file in TSV format. + // https://github.com/google/sentencepiece/blob/master/doc/normalization.md + // This field is only used in SentencePieceTrainer::Train() method, which + // compiles the rule into the binary rule stored in `precompiled_charsmap`. 
+ NormalizationRuleTsv *string `protobuf:"bytes,6,opt,name=normalization_rule_tsv,json=normalizationRuleTsv" json:"normalization_rule_tsv,omitempty"` +} + +// Default values for NormalizerSpec fields. +const ( + Default_NormalizerSpec_AddDummyPrefix = bool(true) + Default_NormalizerSpec_RemoveExtraWhitespaces = bool(true) + Default_NormalizerSpec_EscapeWhitespaces = bool(true) +) + +func (x *NormalizerSpec) Reset() { + *x = NormalizerSpec{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *NormalizerSpec) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*NormalizerSpec) ProtoMessage() {} + +func (x *NormalizerSpec) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use NormalizerSpec.ProtoReflect.Descriptor instead. 
+func (*NormalizerSpec) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{1} +} + +func (x *NormalizerSpec) GetName() string { + if x != nil && x.Name != nil { + return *x.Name + } + return "" +} + +func (x *NormalizerSpec) GetPrecompiledCharsmap() []byte { + if x != nil { + return x.PrecompiledCharsmap + } + return nil +} + +func (x *NormalizerSpec) GetAddDummyPrefix() bool { + if x != nil && x.AddDummyPrefix != nil { + return *x.AddDummyPrefix + } + return Default_NormalizerSpec_AddDummyPrefix +} + +func (x *NormalizerSpec) GetRemoveExtraWhitespaces() bool { + if x != nil && x.RemoveExtraWhitespaces != nil { + return *x.RemoveExtraWhitespaces + } + return Default_NormalizerSpec_RemoveExtraWhitespaces +} + +func (x *NormalizerSpec) GetEscapeWhitespaces() bool { + if x != nil && x.EscapeWhitespaces != nil { + return *x.EscapeWhitespaces + } + return Default_NormalizerSpec_EscapeWhitespaces +} + +func (x *NormalizerSpec) GetNormalizationRuleTsv() string { + if x != nil && x.NormalizationRuleTsv != nil { + return *x.NormalizationRuleTsv + } + return "" +} + +// Proto to store samples for self-testing. 
+type SelfTestData struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + Samples []*SelfTestData_Sample `protobuf:"bytes,1,rep,name=samples" json:"samples,omitempty"` +} + +func (x *SelfTestData) Reset() { + *x = SelfTestData{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *SelfTestData) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SelfTestData) ProtoMessage() {} + +func (x *SelfTestData) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SelfTestData.ProtoReflect.Descriptor instead. +func (*SelfTestData) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{2} +} + +func (x *SelfTestData) GetSamples() []*SelfTestData_Sample { + if x != nil { + return x.Samples + } + return nil +} + +// ModelProto stores model parameters. +// SentencePieceProcessor is supposed to be self-contained. +// All settings/parameters which may change the behavior must be encoded +// in ModelProto. +type ModelProto struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + // Sentence pieces with scores. + Pieces []*ModelProto_SentencePiece `protobuf:"bytes,1,rep,name=pieces" json:"pieces,omitempty"` + // Spec used to generate this model file. + TrainerSpec *TrainerSpec `protobuf:"bytes,2,opt,name=trainer_spec,json=trainerSpec" json:"trainer_spec,omitempty"` + // Spec for text normalization. 
+ NormalizerSpec *NormalizerSpec `protobuf:"bytes,3,opt,name=normalizer_spec,json=normalizerSpec" json:"normalizer_spec,omitempty"` + // Stores sample input and its expected segmentation to verify the model. + SelfTestData *SelfTestData `protobuf:"bytes,4,opt,name=self_test_data,json=selfTestData" json:"self_test_data,omitempty"` + // Spec for text de-normalization. + DenormalizerSpec *NormalizerSpec `protobuf:"bytes,5,opt,name=denormalizer_spec,json=denormalizerSpec" json:"denormalizer_spec,omitempty"` +} + +func (x *ModelProto) Reset() { + *x = ModelProto{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ModelProto) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ModelProto) ProtoMessage() {} + +func (x *ModelProto) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ModelProto.ProtoReflect.Descriptor instead. 
+func (*ModelProto) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{3} +} + +func (x *ModelProto) GetPieces() []*ModelProto_SentencePiece { + if x != nil { + return x.Pieces + } + return nil +} + +func (x *ModelProto) GetTrainerSpec() *TrainerSpec { + if x != nil { + return x.TrainerSpec + } + return nil +} + +func (x *ModelProto) GetNormalizerSpec() *NormalizerSpec { + if x != nil { + return x.NormalizerSpec + } + return nil +} + +func (x *ModelProto) GetSelfTestData() *SelfTestData { + if x != nil { + return x.SelfTestData + } + return nil +} + +func (x *ModelProto) GetDenormalizerSpec() *NormalizerSpec { + if x != nil { + return x.DenormalizerSpec + } + return nil +} + +type SelfTestData_Sample struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Input *string `protobuf:"bytes,1,opt,name=input" json:"input,omitempty"` + Expected *string `protobuf:"bytes,2,opt,name=expected" json:"expected,omitempty"` +} + +func (x *SelfTestData_Sample) Reset() { + *x = SelfTestData_Sample{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *SelfTestData_Sample) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SelfTestData_Sample) ProtoMessage() {} + +func (x *SelfTestData_Sample) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[4] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SelfTestData_Sample.ProtoReflect.Descriptor instead. 
+func (*SelfTestData_Sample) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{2, 0} +} + +func (x *SelfTestData_Sample) GetInput() string { + if x != nil && x.Input != nil { + return *x.Input + } + return "" +} + +func (x *SelfTestData_Sample) GetExpected() string { + if x != nil && x.Expected != nil { + return *x.Expected + } + return "" +} + +type ModelProto_SentencePiece struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + Piece *string `protobuf:"bytes,1,opt,name=piece" json:"piece,omitempty"` // piece must not be empty. + Score *float32 `protobuf:"fixed32,2,opt,name=score" json:"score,omitempty"` + Type *ModelProto_SentencePiece_Type `protobuf:"varint,3,opt,name=type,enum=sentencepiece.ModelProto_SentencePiece_Type,def=1" json:"type,omitempty"` +} + +// Default values for ModelProto_SentencePiece fields. +const ( + Default_ModelProto_SentencePiece_Type = ModelProto_SentencePiece_NORMAL +) + +func (x *ModelProto_SentencePiece) Reset() { + *x = ModelProto_SentencePiece{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ModelProto_SentencePiece) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ModelProto_SentencePiece) ProtoMessage() {} + +func (x *ModelProto_SentencePiece) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[5] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ModelProto_SentencePiece.ProtoReflect.Descriptor instead. 
+func (*ModelProto_SentencePiece) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{3, 0} +} + +func (x *ModelProto_SentencePiece) GetPiece() string { + if x != nil && x.Piece != nil { + return *x.Piece + } + return "" +} + +func (x *ModelProto_SentencePiece) GetScore() float32 { + if x != nil && x.Score != nil { + return *x.Score + } + return 0 +} + +func (x *ModelProto_SentencePiece) GetType() ModelProto_SentencePiece_Type { + if x != nil && x.Type != nil { + return *x.Type + } + return Default_ModelProto_SentencePiece_Type +} + +var File_sentencepiece_model_proto protoreflect.FileDescriptor + +var file_sentencepiece_model_proto_rawDesc = []byte{ + 0x0a, 0x19, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, + 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x0d, 0x73, 0x65, 0x6e, + 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x22, 0xc6, 0x12, 0x0a, 0x0b, 0x54, + 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6e, + 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 0x03, 0x28, 0x09, 0x52, 0x05, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x12, 0x21, 0x0a, 0x0c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, + 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x46, 0x6f, 0x72, + 0x6d, 0x61, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, 0x70, 0x72, 0x65, + 0x66, 0x69, 0x78, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x6d, 0x6f, 0x64, 0x65, 0x6c, + 0x50, 0x72, 0x65, 0x66, 0x69, 0x78, 0x12, 0x4c, 0x0a, 0x0a, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, + 0x74, 0x79, 0x70, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x24, 0x2e, 0x73, 0x65, 0x6e, + 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x54, 0x72, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x54, 0x79, 0x70, 0x65, + 0x3a, 0x07, 0x55, 0x4e, 
0x49, 0x47, 0x52, 0x41, 0x4d, 0x52, 0x09, 0x6d, 0x6f, 0x64, 0x65, 0x6c, + 0x54, 0x79, 0x70, 0x65, 0x12, 0x23, 0x0a, 0x0a, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x5f, 0x73, 0x69, + 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x04, 0x38, 0x30, 0x30, 0x30, 0x52, 0x09, + 0x76, 0x6f, 0x63, 0x61, 0x62, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x27, 0x0a, 0x0f, 0x61, 0x63, 0x63, + 0x65, 0x70, 0x74, 0x5f, 0x6c, 0x61, 0x6e, 0x67, 0x75, 0x61, 0x67, 0x65, 0x18, 0x05, 0x20, 0x03, + 0x28, 0x09, 0x52, 0x0e, 0x61, 0x63, 0x63, 0x65, 0x70, 0x74, 0x4c, 0x61, 0x6e, 0x67, 0x75, 0x61, + 0x67, 0x65, 0x12, 0x34, 0x0a, 0x15, 0x73, 0x65, 0x6c, 0x66, 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, + 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, + 0x05, 0x3a, 0x01, 0x30, 0x52, 0x12, 0x73, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x53, 0x61, + 0x6d, 0x70, 0x6c, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x45, 0x0a, 0x1b, 0x65, 0x6e, 0x61, 0x62, + 0x6c, 0x65, 0x5f, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, + 0x70, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x18, 0x32, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, + 0x61, 0x6c, 0x73, 0x65, 0x52, 0x19, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x44, 0x69, 0x66, 0x66, + 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x50, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x12, + 0x4a, 0x0a, 0x20, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, + 0x70, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x5f, 0x6e, 0x6f, 0x69, 0x73, 0x65, 0x5f, 0x6c, 0x65, + 0x76, 0x65, 0x6c, 0x18, 0x33, 0x20, 0x01, 0x28, 0x02, 0x3a, 0x01, 0x30, 0x52, 0x1d, 0x64, 0x69, + 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x50, 0x72, 0x69, 0x76, 0x61, 0x63, + 0x79, 0x4e, 0x6f, 0x69, 0x73, 0x65, 0x4c, 0x65, 0x76, 0x65, 0x6c, 0x12, 0x58, 0x0a, 0x27, 0x64, + 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x70, 0x72, 0x69, 0x76, + 0x61, 0x63, 0x79, 0x5f, 0x63, 0x6c, 0x69, 0x70, 0x70, 0x69, 
0x6e, 0x67, 0x5f, 0x74, 0x68, 0x72, + 0x65, 0x73, 0x68, 0x6f, 0x6c, 0x64, 0x18, 0x34, 0x20, 0x01, 0x28, 0x04, 0x3a, 0x01, 0x30, 0x52, + 0x24, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x50, 0x72, 0x69, + 0x76, 0x61, 0x63, 0x79, 0x43, 0x6c, 0x69, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x54, 0x68, 0x72, 0x65, + 0x73, 0x68, 0x6f, 0x6c, 0x64, 0x12, 0x35, 0x0a, 0x12, 0x63, 0x68, 0x61, 0x72, 0x61, 0x63, 0x74, + 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x76, 0x65, 0x72, 0x61, 0x67, 0x65, 0x18, 0x0a, 0x20, 0x01, 0x28, + 0x02, 0x3a, 0x06, 0x30, 0x2e, 0x39, 0x39, 0x39, 0x35, 0x52, 0x11, 0x63, 0x68, 0x61, 0x72, 0x61, + 0x63, 0x74, 0x65, 0x72, 0x43, 0x6f, 0x76, 0x65, 0x72, 0x61, 0x67, 0x65, 0x12, 0x31, 0x0a, 0x13, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, + 0x69, 0x7a, 0x65, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x04, 0x3a, 0x01, 0x30, 0x52, 0x11, 0x69, 0x6e, + 0x70, 0x75, 0x74, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, + 0x3a, 0x0a, 0x16, 0x73, 0x68, 0x75, 0x66, 0x66, 0x6c, 0x65, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x18, 0x13, 0x20, 0x01, 0x28, 0x08, 0x3a, + 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x14, 0x73, 0x68, 0x75, 0x66, 0x66, 0x6c, 0x65, 0x49, 0x6e, + 0x70, 0x75, 0x74, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x12, 0x34, 0x0a, 0x14, 0x6d, + 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, + 0x69, 0x7a, 0x65, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x05, 0x42, 0x02, 0x18, 0x01, 0x52, 0x12, 0x6d, + 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, + 0x65, 0x12, 0x38, 0x0a, 0x16, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x5f, 0x73, 0x65, + 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x0d, 0x20, 0x01, 0x28, + 0x05, 0x42, 0x02, 0x18, 0x01, 0x52, 0x14, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x53, + 
0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x3f, 0x0a, 0x17, 0x73, + 0x65, 0x65, 0x64, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, + 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x07, 0x31, 0x30, + 0x30, 0x30, 0x30, 0x30, 0x30, 0x52, 0x15, 0x73, 0x65, 0x65, 0x64, 0x53, 0x65, 0x6e, 0x74, 0x65, + 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x2f, 0x0a, 0x10, + 0x73, 0x68, 0x72, 0x69, 0x6e, 0x6b, 0x69, 0x6e, 0x67, 0x5f, 0x66, 0x61, 0x63, 0x74, 0x6f, 0x72, + 0x18, 0x0f, 0x20, 0x01, 0x28, 0x02, 0x3a, 0x04, 0x30, 0x2e, 0x37, 0x35, 0x52, 0x0f, 0x73, 0x68, + 0x72, 0x69, 0x6e, 0x6b, 0x69, 0x6e, 0x67, 0x46, 0x61, 0x63, 0x74, 0x6f, 0x72, 0x12, 0x34, 0x0a, + 0x13, 0x6d, 0x61, 0x78, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x6c, 0x65, + 0x6e, 0x67, 0x74, 0x68, 0x18, 0x12, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x04, 0x34, 0x31, 0x39, 0x32, + 0x52, 0x11, 0x6d, 0x61, 0x78, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x4c, 0x65, 0x6e, + 0x67, 0x74, 0x68, 0x12, 0x23, 0x0a, 0x0b, 0x6e, 0x75, 0x6d, 0x5f, 0x74, 0x68, 0x72, 0x65, 0x61, + 0x64, 0x73, 0x18, 0x10, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x02, 0x31, 0x36, 0x52, 0x0a, 0x6e, 0x75, + 0x6d, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x12, 0x2f, 0x0a, 0x12, 0x6e, 0x75, 0x6d, 0x5f, + 0x73, 0x75, 0x62, 0x5f, 0x69, 0x74, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x11, + 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x32, 0x52, 0x10, 0x6e, 0x75, 0x6d, 0x53, 0x75, 0x62, 0x49, + 0x74, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x3c, 0x0a, 0x18, 0x6d, 0x61, 0x78, + 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x6c, + 0x65, 0x6e, 0x67, 0x74, 0x68, 0x18, 0x14, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x02, 0x31, 0x36, 0x52, + 0x16, 0x6d, 0x61, 0x78, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, + 0x65, 0x4c, 0x65, 0x6e, 0x67, 0x74, 
0x68, 0x12, 0x3b, 0x0a, 0x17, 0x73, 0x70, 0x6c, 0x69, 0x74, + 0x5f, 0x62, 0x79, 0x5f, 0x75, 0x6e, 0x69, 0x63, 0x6f, 0x64, 0x65, 0x5f, 0x73, 0x63, 0x72, 0x69, + 0x70, 0x74, 0x18, 0x15, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x14, + 0x73, 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x55, 0x6e, 0x69, 0x63, 0x6f, 0x64, 0x65, 0x53, 0x63, + 0x72, 0x69, 0x70, 0x74, 0x12, 0x2c, 0x0a, 0x0f, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x62, 0x79, + 0x5f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, 0x17, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, + 0x72, 0x75, 0x65, 0x52, 0x0d, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x4e, 0x75, 0x6d, 0x62, + 0x65, 0x72, 0x12, 0x34, 0x0a, 0x13, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x62, 0x79, 0x5f, 0x77, + 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x18, 0x16, 0x20, 0x01, 0x28, 0x08, 0x3a, + 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x11, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x57, 0x68, + 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x12, 0x42, 0x0a, 0x1a, 0x74, 0x72, 0x65, 0x61, + 0x74, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x5f, 0x61, 0x73, 0x5f, + 0x73, 0x75, 0x66, 0x66, 0x69, 0x78, 0x18, 0x18, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, + 0x6c, 0x73, 0x65, 0x52, 0x17, 0x74, 0x72, 0x65, 0x61, 0x74, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, + 0x70, 0x61, 0x63, 0x65, 0x41, 0x73, 0x53, 0x75, 0x66, 0x66, 0x69, 0x78, 0x12, 0x46, 0x0a, 0x1c, + 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, + 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x18, 0x1a, 0x20, 0x01, + 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x19, 0x61, 0x6c, 0x6c, 0x6f, 0x77, + 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x4f, 0x6e, 0x6c, 0x79, 0x50, 0x69, + 0x65, 0x63, 0x65, 0x73, 0x12, 0x28, 0x0a, 0x0c, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x64, 0x69, + 0x67, 0x69, 0x74, 0x73, 0x18, 0x19, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 
0x66, 0x61, 0x6c, 0x73, + 0x65, 0x52, 0x0b, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x44, 0x69, 0x67, 0x69, 0x74, 0x73, 0x12, 0x3d, + 0x0a, 0x19, 0x70, 0x72, 0x65, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x5f, 0x64, 0x65, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x65, 0x72, 0x18, 0x35, 0x20, 0x01, 0x28, + 0x09, 0x3a, 0x00, 0x52, 0x18, 0x70, 0x72, 0x65, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x44, 0x65, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x65, 0x72, 0x12, 0x27, 0x0a, + 0x0f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x5f, 0x73, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, + 0x18, 0x1e, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0e, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x53, + 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x12, 0x30, 0x0a, 0x14, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x64, + 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x5f, 0x73, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x18, 0x1f, + 0x20, 0x03, 0x28, 0x09, 0x52, 0x12, 0x75, 0x73, 0x65, 0x72, 0x44, 0x65, 0x66, 0x69, 0x6e, 0x65, + 0x64, 0x53, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x72, 0x65, 0x71, 0x75, + 0x69, 0x72, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x61, 0x72, 0x73, 0x18, 0x24, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x0d, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x43, 0x68, 0x61, 0x72, 0x73, 0x12, + 0x2a, 0x0a, 0x0d, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x66, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, + 0x18, 0x23, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x0c, 0x62, + 0x79, 0x74, 0x65, 0x46, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x12, 0x47, 0x0a, 0x1d, 0x76, + 0x6f, 0x63, 0x61, 0x62, 0x75, 0x6c, 0x61, 0x72, 0x79, 0x5f, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, + 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x18, 0x20, 0x20, 0x01, + 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x1a, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x75, + 0x6c, 0x61, 0x72, 0x79, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x50, 0x69, 0x65, 0x63, 0x65, 0x53, + 0x63, 0x6f, 
0x72, 0x65, 0x12, 0x2e, 0x0a, 0x10, 0x68, 0x61, 0x72, 0x64, 0x5f, 0x76, 0x6f, 0x63, + 0x61, 0x62, 0x5f, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x18, 0x21, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, + 0x74, 0x72, 0x75, 0x65, 0x52, 0x0e, 0x68, 0x61, 0x72, 0x64, 0x56, 0x6f, 0x63, 0x61, 0x62, 0x4c, + 0x69, 0x6d, 0x69, 0x74, 0x12, 0x29, 0x0a, 0x0d, 0x75, 0x73, 0x65, 0x5f, 0x61, 0x6c, 0x6c, 0x5f, + 0x76, 0x6f, 0x63, 0x61, 0x62, 0x18, 0x22, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, + 0x73, 0x65, 0x52, 0x0b, 0x75, 0x73, 0x65, 0x41, 0x6c, 0x6c, 0x56, 0x6f, 0x63, 0x61, 0x62, 0x12, + 0x18, 0x0a, 0x06, 0x75, 0x6e, 0x6b, 0x5f, 0x69, 0x64, 0x18, 0x28, 0x20, 0x01, 0x28, 0x05, 0x3a, + 0x01, 0x30, 0x52, 0x05, 0x75, 0x6e, 0x6b, 0x49, 0x64, 0x12, 0x18, 0x0a, 0x06, 0x62, 0x6f, 0x73, + 0x5f, 0x69, 0x64, 0x18, 0x29, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x31, 0x52, 0x05, 0x62, 0x6f, + 0x73, 0x49, 0x64, 0x12, 0x18, 0x0a, 0x06, 0x65, 0x6f, 0x73, 0x5f, 0x69, 0x64, 0x18, 0x2a, 0x20, + 0x01, 0x28, 0x05, 0x3a, 0x01, 0x32, 0x52, 0x05, 0x65, 0x6f, 0x73, 0x49, 0x64, 0x12, 0x19, 0x0a, + 0x06, 0x70, 0x61, 0x64, 0x5f, 0x69, 0x64, 0x18, 0x2b, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x02, 0x2d, + 0x31, 0x52, 0x05, 0x70, 0x61, 0x64, 0x49, 0x64, 0x12, 0x22, 0x0a, 0x09, 0x75, 0x6e, 0x6b, 0x5f, + 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x2d, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x05, 0x3c, 0x75, 0x6e, + 0x6b, 0x3e, 0x52, 0x08, 0x75, 0x6e, 0x6b, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x20, 0x0a, 0x09, + 0x62, 0x6f, 0x73, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x2e, 0x20, 0x01, 0x28, 0x09, 0x3a, + 0x03, 0x3c, 0x73, 0x3e, 0x52, 0x08, 0x62, 0x6f, 0x73, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x21, + 0x0a, 0x09, 0x65, 0x6f, 0x73, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x2f, 0x20, 0x01, 0x28, + 0x09, 0x3a, 0x04, 0x3c, 0x2f, 0x73, 0x3e, 0x52, 0x08, 0x65, 0x6f, 0x73, 0x50, 0x69, 0x65, 0x63, + 0x65, 0x12, 0x22, 0x0a, 0x09, 0x70, 0x61, 0x64, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x30, + 0x20, 0x01, 0x28, 0x09, 0x3a, 0x05, 0x3c, 0x70, 
0x61, 0x64, 0x3e, 0x52, 0x08, 0x70, 0x61, 0x64, + 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x26, 0x0a, 0x0b, 0x75, 0x6e, 0x6b, 0x5f, 0x73, 0x75, 0x72, + 0x66, 0x61, 0x63, 0x65, 0x18, 0x2c, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x05, 0x20, 0xe2, 0x81, 0x87, + 0x20, 0x52, 0x0a, 0x75, 0x6e, 0x6b, 0x53, 0x75, 0x72, 0x66, 0x61, 0x63, 0x65, 0x12, 0x46, 0x0a, + 0x1c, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x5f, 0x65, 0x78, 0x74, 0x72, 0x65, 0x6d, 0x65, 0x6c, 0x79, + 0x5f, 0x6c, 0x61, 0x72, 0x67, 0x65, 0x5f, 0x63, 0x6f, 0x72, 0x70, 0x75, 0x73, 0x18, 0x31, 0x20, + 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x19, 0x74, 0x72, 0x61, 0x69, + 0x6e, 0x45, 0x78, 0x74, 0x72, 0x65, 0x6d, 0x65, 0x6c, 0x79, 0x4c, 0x61, 0x72, 0x67, 0x65, 0x43, + 0x6f, 0x72, 0x70, 0x75, 0x73, 0x12, 0x3a, 0x0a, 0x18, 0x73, 0x65, 0x65, 0x64, 0x5f, 0x73, 0x65, + 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x5f, 0x66, 0x69, 0x6c, + 0x65, 0x18, 0x36, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x00, 0x52, 0x16, 0x73, 0x65, 0x65, 0x64, 0x53, + 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x46, 0x69, 0x6c, + 0x65, 0x22, 0x35, 0x0a, 0x09, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, + 0x0a, 0x07, 0x55, 0x4e, 0x49, 0x47, 0x52, 0x41, 0x4d, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x42, + 0x50, 0x45, 0x10, 0x02, 0x12, 0x08, 0x0a, 0x04, 0x57, 0x4f, 0x52, 0x44, 0x10, 0x03, 0x12, 0x08, + 0x0a, 0x04, 0x43, 0x48, 0x41, 0x52, 0x10, 0x04, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, + 0x80, 0x80, 0x02, 0x22, 0xbd, 0x02, 0x0a, 0x0e, 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, + 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x31, 0x0a, 0x14, 0x70, 0x72, + 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x61, 0x72, 0x73, 0x6d, + 0x61, 0x70, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x13, 0x70, 0x72, 0x65, 0x63, 
0x6f, 0x6d, + 0x70, 0x69, 0x6c, 0x65, 0x64, 0x43, 0x68, 0x61, 0x72, 0x73, 0x6d, 0x61, 0x70, 0x12, 0x2e, 0x0a, + 0x10, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x75, 0x6d, 0x6d, 0x79, 0x5f, 0x70, 0x72, 0x65, 0x66, 0x69, + 0x78, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x0e, 0x61, + 0x64, 0x64, 0x44, 0x75, 0x6d, 0x6d, 0x79, 0x50, 0x72, 0x65, 0x66, 0x69, 0x78, 0x12, 0x3e, 0x0a, + 0x18, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x5f, 0x65, 0x78, 0x74, 0x72, 0x61, 0x5f, 0x77, 0x68, + 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x08, 0x3a, + 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x16, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x45, 0x78, 0x74, + 0x72, 0x61, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73, 0x12, 0x33, 0x0a, + 0x12, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, + 0x63, 0x65, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, + 0x11, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, + 0x65, 0x73, 0x12, 0x34, 0x0a, 0x16, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x5f, 0x74, 0x73, 0x76, 0x18, 0x06, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x14, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x52, 0x75, 0x6c, 0x65, 0x54, 0x73, 0x76, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, + 0x80, 0x80, 0x02, 0x22, 0x93, 0x01, 0x0a, 0x0c, 0x53, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, + 0x44, 0x61, 0x74, 0x61, 0x12, 0x3c, 0x0a, 0x07, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x73, 0x18, + 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x22, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, + 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x53, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x44, 0x61, + 0x74, 0x61, 0x2e, 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x52, 0x07, 0x73, 0x61, 0x6d, 0x70, 0x6c, + 0x65, 0x73, 0x1a, 0x3a, 
0x0a, 0x06, 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x12, 0x14, 0x0a, 0x05, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x69, 0x6e, 0x70, + 0x75, 0x74, 0x12, 0x1a, 0x0a, 0x08, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x2a, 0x09, + 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 0x22, 0xd7, 0x04, 0x0a, 0x0a, 0x4d, 0x6f, + 0x64, 0x65, 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x3f, 0x0a, 0x06, 0x70, 0x69, 0x65, 0x63, + 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, + 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x50, 0x72, + 0x6f, 0x74, 0x6f, 0x2e, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x50, 0x69, 0x65, 0x63, + 0x65, 0x52, 0x06, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x12, 0x3d, 0x0a, 0x0c, 0x74, 0x72, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, + 0x1a, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, + 0x54, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x0b, 0x74, 0x72, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x46, 0x0a, 0x0f, 0x6e, 0x6f, 0x72, 0x6d, + 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x18, 0x03, 0x20, 0x01, 0x28, + 0x0b, 0x32, 0x1d, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, + 0x65, 0x2e, 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, + 0x52, 0x0e, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, + 0x12, 0x41, 0x0a, 0x0e, 0x73, 0x65, 0x6c, 0x66, 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, 0x64, 0x61, + 0x74, 0x61, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1b, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, + 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x53, 
0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, + 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0c, 0x73, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x44, + 0x61, 0x74, 0x61, 0x12, 0x4a, 0x0a, 0x11, 0x64, 0x65, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, + 0x7a, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1d, + 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4e, + 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x10, 0x64, + 0x65, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x1a, + 0xe6, 0x01, 0x0a, 0x0d, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x50, 0x69, 0x65, 0x63, + 0x65, 0x12, 0x14, 0x0a, 0x05, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x05, 0x70, 0x69, 0x65, 0x63, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x63, 0x6f, 0x72, 0x65, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x02, 0x52, 0x05, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x12, 0x48, 0x0a, + 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x2c, 0x2e, 0x73, 0x65, + 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4d, 0x6f, 0x64, 0x65, + 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x50, + 0x69, 0x65, 0x63, 0x65, 0x2e, 0x54, 0x79, 0x70, 0x65, 0x3a, 0x06, 0x4e, 0x4f, 0x52, 0x4d, 0x41, + 0x4c, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x22, 0x54, 0x0a, 0x04, 0x54, 0x79, 0x70, 0x65, 0x12, + 0x0a, 0x0a, 0x06, 0x4e, 0x4f, 0x52, 0x4d, 0x41, 0x4c, 0x10, 0x01, 0x12, 0x0b, 0x0a, 0x07, 0x55, + 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x12, 0x0b, 0x0a, 0x07, 0x43, 0x4f, 0x4e, 0x54, + 0x52, 0x4f, 0x4c, 0x10, 0x03, 0x12, 0x10, 0x0a, 0x0c, 0x55, 0x53, 0x45, 0x52, 0x5f, 0x44, 0x45, + 0x46, 0x49, 0x4e, 0x45, 0x44, 0x10, 0x04, 0x12, 0x08, 0x0a, 0x04, 0x42, 0x59, 0x54, 0x45, 0x10, + 0x06, 0x12, 0x0a, 0x0a, 0x06, 0x55, 0x4e, 0x55, 0x53, 0x45, 0x44, 0x10, 0x05, 0x2a, 0x09, 0x08, + 
0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, + 0x80, 0x80, 0x02, 0x42, 0x13, 0x48, 0x03, 0x5a, 0x0f, 0x2e, 0x2f, 0x73, 0x65, 0x6e, 0x74, 0x65, + 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, +} + +var ( + file_sentencepiece_model_proto_rawDescOnce sync.Once + file_sentencepiece_model_proto_rawDescData = file_sentencepiece_model_proto_rawDesc +) + +func file_sentencepiece_model_proto_rawDescGZIP() []byte { + file_sentencepiece_model_proto_rawDescOnce.Do(func() { + file_sentencepiece_model_proto_rawDescData = protoimpl.X.CompressGZIP(file_sentencepiece_model_proto_rawDescData) + }) + return file_sentencepiece_model_proto_rawDescData +} + +var file_sentencepiece_model_proto_enumTypes = make([]protoimpl.EnumInfo, 2) +var file_sentencepiece_model_proto_msgTypes = make([]protoimpl.MessageInfo, 6) +var file_sentencepiece_model_proto_goTypes = []interface{}{ + (TrainerSpec_ModelType)(0), // 0: sentencepiece.TrainerSpec.ModelType + (ModelProto_SentencePiece_Type)(0), // 1: sentencepiece.ModelProto.SentencePiece.Type + (*TrainerSpec)(nil), // 2: sentencepiece.TrainerSpec + (*NormalizerSpec)(nil), // 3: sentencepiece.NormalizerSpec + (*SelfTestData)(nil), // 4: sentencepiece.SelfTestData + (*ModelProto)(nil), // 5: sentencepiece.ModelProto + (*SelfTestData_Sample)(nil), // 6: sentencepiece.SelfTestData.Sample + (*ModelProto_SentencePiece)(nil), // 7: sentencepiece.ModelProto.SentencePiece +} +var file_sentencepiece_model_proto_depIdxs = []int32{ + 0, // 0: sentencepiece.TrainerSpec.model_type:type_name -> sentencepiece.TrainerSpec.ModelType + 6, // 1: sentencepiece.SelfTestData.samples:type_name -> sentencepiece.SelfTestData.Sample + 7, // 2: sentencepiece.ModelProto.pieces:type_name -> sentencepiece.ModelProto.SentencePiece + 2, // 3: sentencepiece.ModelProto.trainer_spec:type_name -> sentencepiece.TrainerSpec + 3, // 4: sentencepiece.ModelProto.normalizer_spec:type_name -> sentencepiece.NormalizerSpec + 4, // 5: 
sentencepiece.ModelProto.self_test_data:type_name -> sentencepiece.SelfTestData + 3, // 6: sentencepiece.ModelProto.denormalizer_spec:type_name -> sentencepiece.NormalizerSpec + 1, // 7: sentencepiece.ModelProto.SentencePiece.type:type_name -> sentencepiece.ModelProto.SentencePiece.Type + 8, // [8:8] is the sub-list for method output_type + 8, // [8:8] is the sub-list for method input_type + 8, // [8:8] is the sub-list for extension type_name + 8, // [8:8] is the sub-list for extension extendee + 0, // [0:8] is the sub-list for field type_name +} + +func init() { file_sentencepiece_model_proto_init() } +func file_sentencepiece_model_proto_init() { + if File_sentencepiece_model_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_sentencepiece_model_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*TrainerSpec); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*NormalizerSpec); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*SelfTestData); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ModelProto); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[4].Exporter 
= func(v interface{}, i int) interface{} { + switch v := v.(*SelfTestData_Sample); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ModelProto_SentencePiece); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_sentencepiece_model_proto_rawDesc, + NumEnums: 2, + NumMessages: 6, + NumExtensions: 0, + NumServices: 0, + }, + GoTypes: file_sentencepiece_model_proto_goTypes, + DependencyIndexes: file_sentencepiece_model_proto_depIdxs, + EnumInfos: file_sentencepiece_model_proto_enumTypes, + MessageInfos: file_sentencepiece_model_proto_msgTypes, + }.Build() + File_sentencepiece_model_proto = out.File + file_sentencepiece_model_proto_rawDesc = nil + file_sentencepiece_model_proto_goTypes = nil + file_sentencepiece_model_proto_depIdxs = nil +} diff --git a/convert/sentencepiece_model.proto b/convert/sentencepiece_model.proto new file mode 100644 index 00000000..5dc02d6c --- /dev/null +++ b/convert/sentencepiece_model.proto @@ -0,0 +1,333 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License.! + +syntax = "proto2"; + +// TODO(taku): Needs to use LITE RUNTIME in OSS release. +option optimize_for = LITE_RUNTIME; +option go_package = "./sentencepiece"; + +package sentencepiece; + +// TrainerSpec encodes a various parameters for SentencePiece training. +// Next id: 55 +message TrainerSpec { + /////////////////////////////////////////////////////////////////// + // General parameters + // + // Input corpus files. + // Trainer accepts the following two formats: + // A) Monolingual: plain text, one sentence per line. + // B) Bilingual: TSV, source sentence target sentence + // When bilingual data is passed, shared vocabulary model is built. + // Note that the input file must be raw corpus, not a preprocessed corpus. + // Trainer only loads the first `input_sentence_size` sentences specified + // with this parameter. + repeated string input = 1; + + // Input corpus format: + // "text": one-sentence-per-line text format (default) + // "tsv": sentence freq + optional string input_format = 7; + + // Output model file prefix. + // .model and .vocab are generated. + optional string model_prefix = 2; + + // Model type. only have UNIGRAM now. + enum ModelType { + UNIGRAM = 1; // Unigram language model with dynamic algorithm + BPE = 2; // Byte Pair Encoding + WORD = 3; // Delimitered by whitespace. + CHAR = 4; // tokenizes into character sequence + } + optional ModelType model_type = 3 [default = UNIGRAM]; + + // Vocabulary size. 8k is the default size. + optional int32 vocab_size = 4 [default = 8000]; + + // List of the languages this model can accept. + // Since the model is language-agnostic, this field is used as a reference. + repeated string accept_language = 5; + + // Size of self-test samples, which are encoded in the model file. + optional int32 self_test_sample_size = 6 [default = 0]; + + // Whether to use DP version of sentencepiece. 
Use it with TSV input format + // (requires precomputed word tab counts to work). + optional bool enable_differential_privacy = 50 [default = false]; + // Set these parameters if you need DP version of sentencepiece. + // std of noise to add. + optional float differential_privacy_noise_level = 51 [default = 0.0]; + // Clipping threshold to apply after adding noise. All the words with + // frequency less than this value are dropped. + optional uint64 differential_privacy_clipping_threshold = 52 [default = 0]; + + /////////////////////////////////////////////////////////////////// + // Training parameters. + // + // Uses characters which cover the corpus with the ratio of `chars_coverage`. + // This parameter determines the set of basic Alphabet of sentence piece. + // 1.0 - `chars_coverage` characters are treated as UNK. + // See also required_chars field. + optional float character_coverage = 10 [default = 0.9995]; + + // Maximum size of sentences the trainer loads from `input` parameter. + // Trainer simply loads the `input` files in sequence. + // It is better to shuffle the input corpus randomly. + optional uint64 input_sentence_size = 11 [default = 0]; + optional bool shuffle_input_sentence = 19 [default = true]; + + // Maximum size of sentences to make seed sentence pieces. + // Extended suffix array is constructed to extract frequent + // sub-strings from the corpus. This uses 20N working space, + // where N is the size of corpus. + optional int32 mining_sentence_size = 12 [deprecated = true]; + + // Maximum size of sentences to train sentence pieces. + optional int32 training_sentence_size = 13 [deprecated = true]; + + // The size of seed sentencepieces. + // `seed_sentencepiece_size` must be larger than `vocab_size`. + optional int32 seed_sentencepiece_size = 14 [default = 1000000]; + + // In every EM sub-iterations, keeps top + // `shrinking_factor` * `current sentencepieces size` with respect to + // the loss of the sentence piece. 
This value should be smaller than 1.0. + optional float shrinking_factor = 15 [default = 0.75]; + + // The maximum sentence length in byte. The sentences with the length + // larger than `max_sentence_length` is simply ignored. + // Longer input tends to bring the following risks: + // * Overflow during EM training (unigram language model only) + // * Performance drop because of O(n log n) cost in BPE. + optional int32 max_sentence_length = 18 [default = 4192]; + + // Number of threads in the training. + optional int32 num_threads = 16 [default = 16]; + + // Number of EM sub iterations. + optional int32 num_sub_iterations = 17 [default = 2]; + + /////////////////////////////////////////////////////////////////// + // SentencePiece parameters which control the shapes of sentence piece. + // + // Maximum length of sentencepiece. + optional int32 max_sentencepiece_length = 20 [default = 16]; + + // Uses Unicode script to split sentence pieces. + // When `split_by_unicode_script` is true, we do not allow sentence piece to + // include multiple Unicode scripts, e.g. "F1" is not a valid piece. + // Exception: CJ characters (Hiragana/Katakana/Han) are all handled + // as one script type, since Japanese word can consist of multiple scripts. + // This exception is always applied regardless of the accept-language + // parameter. + optional bool split_by_unicode_script = 21 [default = true]; + + // When `split_by_number` is true, put a boundary between number and + // non-number transition. If we want to treat "F1" is one token, set this flag + // to be false. + optional bool split_by_number = 23 [default = true]; + + // Use a white space to split sentence pieces. + // When `split_by_whitespace` is false, we may have the piece containing + // a white space in the middle. e.g., "in_the". + optional bool split_by_whitespace = 22 [default = true]; + + // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello => + // hello_. 
When `treat_whitespace_as_suffix` is true, + // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end + // of sentence. + optional bool treat_whitespace_as_suffix = 24 [default = false]; + + // Allows pieces that only contain whitespaces instead of appearing only as + // prefix or suffix of other pieces. + optional bool allow_whitespace_only_pieces = 26 [default = false]; + + // Split all digits (0-9) into separate pieces. + optional bool split_digits = 25 [default = false]; + + // Defines the pre-tokenization delimiter. + // When specified, no pieces crossing this delimiter is not included + // in the vocab. Then the delimiter string is virtually ignored + // during the training. This field can allows constraints on the vocabulary + // selection. Note that this field is available on unigram mode. + optional string pretokenization_delimiter = 53 [ default = ""]; + + /////////////////////////////////////////////////////////////////// + // Vocabulary management + // + // Defines control symbols used as an indicator to + // change the behavior of the decoder. and are pre-defined. + // We can use this field to encode various meta information, + // including language indicator in multilingual model. + // These symbols are not visible to users, but visible to + // the decoder. Note that when the input sentence contains control symbols, + // they are not treated as one token, but segmented into normal pieces. + // Control symbols must be inserted independently from the segmentation. + repeated string control_symbols = 30; + + // Defines user defined symbols. + // These symbols are added with extremely high score + // so they are always treated as one unique symbol in any context. + // Typical usage of user_defined_symbols is placeholder for named entities. + repeated string user_defined_symbols = 31; + + // Defines required characters. Each UTF8 character in this string is included + // in the character set regardless of character_coverage value. 
Unlike + // user_defined_symbols, these characters have scores based on the frequency + // on input sentences, and the model can form subwords using characters + // in this field. + optional string required_chars = 36; + + // Decomposes unknown pieces into UTF-8 bytes. + optional bool byte_fallback = 35 [default = false]; + + // When creating the vocabulary file, defines whether or not to additionally + // output the score for each piece. + optional bool vocabulary_output_piece_score = 32 [default = true]; + + // `vocab_size` is treated as hard limit. Crash if + // the model can not produce the vocab of size `vocab_size`, + // When `hard_vocab_limit` is false, vocab_size is treated + // as soft limit. Note that when model_type=char, + // always assumes hard_vocab_limit = false. + optional bool hard_vocab_limit = 33 [default = true]; + + // use all symbols for vocab extraction. This flag is valid + // if model type is either CHAR or WORD + optional bool use_all_vocab = 34 [default = false]; + + /////////////////////////////////////////////////////////////////// + // Reserved special meta tokens. + // * -1 is not used. + // * unk_id must not be -1. + // Id must starts with 0 and be contigous. + optional int32 unk_id = 40 [default = 0]; // + optional int32 bos_id = 41 [default = 1]; // + optional int32 eos_id = 42 [default = 2]; // + optional int32 pad_id = 43 [default = -1]; // (padding) + optional string unk_piece = 45 [default = ""]; + optional string bos_piece = 46 [default = ""]; + optional string eos_piece = 47 [default = ""]; + optional string pad_piece = 48 [default = ""]; + + // Encodes into U+2047 (DOUBLE QUESTION MARK), + // since this character can be useful both for user and + // developer. We can easily figure out that is emitted. + optional string unk_surface = 44 [default = " \xE2\x81\x87 "]; + + // Increase bit depth to allow unigram model training on large + // (>10M sentences) corpora. 
A Side-effect of enabling this flag + // is increased memory usage. + optional bool train_extremely_large_corpus = 49 [default = false]; + + // Path to a seed sentencepieces file, with one tab-separated + // seed sentencepiece frequency per line. + optional string seed_sentencepieces_file = 54 [default = ""]; + + // Customized extensions: the range of field numbers + // are open to third-party extensions. + extensions 200 to max; +} + +// NormalizerSpec encodes a various parameters for string normalizaiton +message NormalizerSpec { + // name of normalization rule. + optional string name = 1; + + // Pre-compiled normalization rule created by + // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method. + // Usually this field is set by Builder::GetNormalizerSpec() method. + optional bytes precompiled_charsmap = 2; + + // Adds dummy whitespace at the beginning of text in order to + // treat "world" in "world" and "hello world" in the same way. + optional bool add_dummy_prefix = 3 [default = true]; + + // Removes leading, trailing, and duplicate internal whitespace. + optional bool remove_extra_whitespaces = 4 [default = true]; + + // Replaces whitespace with meta symbol. + // This field must be true to train sentence piece model. + optional bool escape_whitespaces = 5 [default = true]; + + // Custom normalization rule file in TSV format. + // https://github.com/google/sentencepiece/blob/master/doc/normalization.md + // This field is only used in SentencePieceTrainer::Train() method, which + // compiles the rule into the binary rule stored in `precompiled_charsmap`. + optional string normalization_rule_tsv = 6; + + // Customized extensions: the range of field numbers + // are open to third-party extensions. + extensions 200 to max; +} + +// Proto to store samples for self-testing. 
+message SelfTestData { + message Sample { + optional string input = 1; + optional string expected = 2; + } + repeated Sample samples = 1; + + // Customized extensions: the range of field numbers + // are open to third-party extensions. + extensions 200 to max; +} + +// ModelProto stores model parameters. +// SentencePieceProcessor is supposed to be self-contained. +// All settings/parameters which may change the behavior must be encoded +// in ModelProto. +message ModelProto { + message SentencePiece { + enum Type { + NORMAL = 1; // normal symbol + UNKNOWN = 2; // unknown symbol. only for now. + CONTROL = 3; // control symbols. , , <2ja> etc. + USER_DEFINED = 4; // user defined symbols. + // Typical usage of USER_DEFINED symbol + // is placeholder. + BYTE = 6; // byte symbols. Used when `byte_fallback` is true. + UNUSED = 5; // this piece is not used. + } + optional string piece = 1; // piece must not be empty. + optional float score = 2; + optional Type type = 3 [default = NORMAL]; + + // Customized extensions: the range of field numbers + // are open to third-party extensions. + extensions 200 to max; + } + + // Sentence pieces with scores. + repeated SentencePiece pieces = 1; + + // Spec used to generate this model file. + optional TrainerSpec trainer_spec = 2; + + // Spec for text normalization. + optional NormalizerSpec normalizer_spec = 3; + + // Stores sample input and its expected segmentation to verify the model. + optional SelfTestData self_test_data = 4; + + // Spec for text de-normalization. + optional NormalizerSpec denormalizer_spec = 5; + + // Customized extensions: the range of field numbers + // are open to third-party extensions. 
+ extensions 200 to max; +} diff --git a/go.mod b/go.mod index e0cbfc41..74f75b47 100644 --- a/go.mod +++ b/go.mod @@ -1,23 +1,43 @@ module github.com/jmorganca/ollama -go 1.21 +go 1.22 + +toolchain go1.22.0 require ( github.com/containerd/console v1.0.3 + github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 github.com/emirpasic/gods v1.18.1 github.com/gin-gonic/gin v1.9.1 + github.com/golang/protobuf v1.5.0 github.com/google/uuid v1.0.0 + github.com/mitchellh/mapstructure v1.5.0 github.com/olekukonko/tablewriter v0.0.5 github.com/spf13/cobra v1.7.0 github.com/stretchr/testify v1.8.4 + github.com/x448/float16 v0.8.4 golang.org/x/sync v0.3.0 ) +require github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9 + require ( + github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc // indirect + github.com/chewxy/hm v1.0.0 // indirect + github.com/chewxy/math32 v1.0.8 // indirect github.com/davecgh/go-spew v1.1.1 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/google/flatbuffers v1.12.0 // indirect github.com/mattn/go-runewidth v0.0.14 // indirect + github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rivo/uniseg v0.2.0 // indirect + github.com/xtgo/set v1.0.0 // indirect + go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect + golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect + gonum.org/v1/gonum v0.8.2 // indirect + gorgonia.org/vecf32 v0.9.0 // indirect + gorgonia.org/vecf64 v0.9.0 // indirect ) require ( @@ -38,7 +58,6 @@ require ( github.com/mattn/go-isatty v0.0.19 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect - github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 github.com/pelletier/go-toml/v2 v2.0.8 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect @@ -50,6 
+69,6 @@ require ( golang.org/x/sys v0.13.0 golang.org/x/term v0.13.0 golang.org/x/text v0.13.0 // indirect - google.golang.org/protobuf v1.30.0 // indirect + google.golang.org/protobuf v1.30.0 gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 39667adc..d1a75b56 100644 --- a/go.sum +++ b/go.sum @@ -1,18 +1,38 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= +github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc h1:zvQ6w7KwtQWgMQiewOF9tFtundRMVZFSAksNV6ogzuY= +github.com/apache/arrow/go/arrow v0.0.0-20201229220542-30ce2eb5d4dc/go.mod h1:c9sxoIT3YgLxH4UhLOCKaBlEojuMhVYpk4Ntv3opUTQ= github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM= github.com/bytedance/sonic v1.9.1 h1:6iJ6NqdoxCDr6mbY8h18oSO+cShGSMRGCEo7F2h0x8s= github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZXU064P/U= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY= github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams= github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk= +github.com/chewxy/hm v1.0.0 h1:zy/TSv3LV2nD3dwUEQL2VhXeoXbb9QkpmdRAVUFiA6k= +github.com/chewxy/hm v1.0.0/go.mod h1:qg9YI4q6Fkj/whwHR1D+bOGeF7SniIP40VweVepLjg0= +github.com/chewxy/math32 v1.0.0/go.mod h1:Miac6hA1ohdDUTagnvJy/q+aNnEk16qWUdb8ZVhvCN0= +github.com/chewxy/math32 v1.0.8 h1:fU5E4Ec4Z+5RtRAi3TovSxUjQPkgRh+HbP7tKB2OFbM= +github.com/chewxy/math32 v1.0.8/go.mod 
h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUwkSsLqs= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw= github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 h1:cBzrdJPAFBsgCrDPnZxlp1dF2+k4r1kVpD7+1S1PVjY= +github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1/go.mod h1:uw2gLcxEuYUlAd/EXyjc/v55nd3+47YAgWbSXVxPrNI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU= github.com/gabriel-vasile/mimetype v1.4.2/go.mod 
h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA= github.com/gin-contrib/cors v1.4.0 h1:oJ6gwtUl3lqV0WEIwM/LxPF1QZ5qe2lGWdY2+bz7y0g= @@ -37,7 +57,31 @@ github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QX github.com/goccy/go-json v0.9.7/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/golang/protobuf v1.5.0 
h1:LUVKkCeviFUMKqHa4tXIIij/lbhnMbP7Fn5wKdKkRh4= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= +github.com/google/flatbuffers v1.12.0 h1:/PtAHvnBY4Kqnx/xCQ3OIV9uYcSFGScBsWI3Oogeh6w= +github.com/google/flatbuffers v1.12.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= @@ -48,6 +92,9 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk= github.com/klauspost/cpuid/v2 
v2.2.4/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY= @@ -68,6 +115,8 @@ github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU= github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -75,14 +124,17 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= -github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0= -github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y= +github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9 h1:DV4iXjNn6fGeDl1AkZ1I0QB/0DBjrc7kPpxHrmuDzW4= +github.com/pdevine/tensor v0.0.0-20240228013915-64ccaa8d9ca9/go.mod h1:nR7l3gM6ubiOm+mCkmmUyIBUcBAyiUmW6dQrDZhugFE= github.com/pelletier/go-toml/v2 v2.0.1/go.mod h1:r9LEWfGN8R5k0VXJ+0BkIe7MYkRdwZOjgMj2KwnJFUo= github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ= 
github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= @@ -96,6 +148,8 @@ github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.1.4/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.2.0/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= @@ -112,19 +166,61 @@ github.com/ugorji/go v1.2.7/go.mod h1:nF9osbDWLy6bDVv/Rtoh6QgnvNDpmCalQV5urGCCS6 github.com/ugorji/go/codec v1.2.7/go.mod h1:WGN1fab3R1fzQlVQTkfxVtIBhWDRqOviHU95kRgeqEY= github.com/ugorji/go/codec v1.2.11 
h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU= github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +github.com/xtgo/set v1.0.0 h1:6BCNBRv3ORNDQ7fyoJXRv+tstJz3m1JVFQErfeZz2pY= +github.com/xtgo/set v1.0.0/go.mod h1:d3NHzGzSa0NmB2NhFyECA+QdRp29oEn2xbT+TpeFoM8= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 h1:lGdhQUN/cnWdSH3291CUuxSEqc+AsGTiDxPP3r2J0l4= +go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6/go.mod h1:FftLjUGFEDu5k8lt0ddY+HcrH/qU/0qk+H8j9/nTl3E= golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k= golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc= golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= +golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp 
v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 h1:m64FZMko/V45gv0bNmrNYoDEq8U5YUhetc9cBWKS1TQ= golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63/go.mod h1:0v4NqG35kSWCMzLaMeX+IQrlSnVE/bqGSyC2cz/9Le8= +golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200904194848-62affa334b73/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net 
v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200909081042-eff7692f9009/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -137,12 +233,56 @@ golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek= golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod 
h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= +gonum.org/v1/gonum v0.8.2 h1:CCXrcPKiGGotvnN6jfUsKk4rRqm7q09/YbKb5xCEvtM= +gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= +gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= +google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f h1:Yv4xsIx7HZOoyUGSJ2ksDyWE2qIBXROsZKt2ny3hCGM= +google.golang.org/genproto v0.0.0-20200911024640-645f7a48b24f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.23.0/go.mod 
h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +google.golang.org/grpc v1.32.0 h1:zWTV+LMdc3kaiJMSTOFz2UgSBgx8RNQoTGiZu3fR9S0= +google.golang.org/grpc v1.32.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/grpc/cmd/protoc-gen-go-grpc v0.0.0-20200910201057-6591123024b3/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= +google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= @@ -157,4 +297,10 @@ gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C gopkg.in/yaml.v3 
v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gorgonia.org/vecf32 v0.9.0 h1:PClazic1r+JVJ1dEzRXgeiVl4g1/Hf/w+wUSqnco1Xg= +gorgonia.org/vecf32 v0.9.0/go.mod h1:NCc+5D2oxddRL11hd+pCB1PEyXWOyiQxfZ/1wwhOXCA= +gorgonia.org/vecf64 v0.9.0 h1:bgZDP5x0OzBF64PjMGC3EvTdOoMEcmfAh1VCUnZFm1A= +gorgonia.org/vecf64 v0.9.0/go.mod h1:hp7IOWCnRiVQKON73kkC/AUMtEXyf9kGlVrtPQ9ccVA= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/llm/ggml.go b/llm/ggml.go index c4fc0a7c..ddcf6ed7 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -163,9 +163,9 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) { case FILE_MAGIC_GGLA: c = &containerLORA{} case FILE_MAGIC_GGUF_LE: - c = &containerGGUF{bo: binary.LittleEndian} + c = &ContainerGGUF{ByteOrder: binary.LittleEndian} case FILE_MAGIC_GGUF_BE: - c = &containerGGUF{bo: binary.BigEndian} + c = &ContainerGGUF{ByteOrder: binary.BigEndian} default: return nil, errors.New("invalid file magic") } diff --git a/llm/gguf.go b/llm/gguf.go index b630b7f4..b01cd5d2 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -5,12 +5,20 @@ import ( "encoding/binary" "fmt" "io" + "log/slog" + "os" + "regexp" + + "github.com/d4l3k/go-bfloat16" + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" + "github.com/x448/float16" "github.com/jmorganca/ollama/format" ) -type containerGGUF struct { - bo binary.ByteOrder +type ContainerGGUF struct { + ByteOrder binary.ByteOrder Version uint32 @@ -23,23 +31,28 @@ type containerGGUF struct { NumTensor uint64 NumKV uint64 } + + V3 struct { + NumTensor uint64 + NumKV uint64 + } } -func 
(c *containerGGUF) Name() string { +func (c *ContainerGGUF) Name() string { return "gguf" } -func (c *containerGGUF) Decode(rso *readSeekOffset) (model, error) { - binary.Read(rso, c.bo, &c.Version) +func (c *ContainerGGUF) Decode(rso *readSeekOffset) (model, error) { + binary.Read(rso, c.ByteOrder, &c.Version) switch c.Version { case 1: - binary.Read(rso, c.bo, &c.V1) + binary.Read(rso, c.ByteOrder, &c.V1) default: - binary.Read(rso, c.bo, &c.V2) + binary.Read(rso, c.ByteOrder, &c.V2) } - model := newGGUFModel(c) + model := NewGGUFModel(c) if err := model.Decode(rso); err != nil { return nil, err } @@ -48,47 +61,61 @@ func (c *containerGGUF) Decode(rso *readSeekOffset) (model, error) { } const ( - ggufTypeUint8 uint32 = iota - ggufTypeInt8 - ggufTypeUint16 - ggufTypeInt16 - ggufTypeUint32 - ggufTypeInt32 - ggufTypeFloat32 - ggufTypeBool - ggufTypeString - ggufTypeArray - ggufTypeUint64 - ggufTypeInt64 - ggufTypeFloat64 + _ uint32 = iota + GGUFTokenNormal + GGUFTokenUnknown + GGUFTokenControl + GGUFTokenUserDefined + GGUFTokenUnused + GGUFTokenByte ) -type kv map[string]any +const ( + GGUFTypeUint8 uint32 = iota + GGUFTypeInt8 + GGUFTypeUint16 + GGUFTypeInt16 + GGUFTypeUint32 + GGUFTypeInt32 + GGUFTypeFloat32 + GGUFTypeBool + GGUFTypeString + GGUFTypeArray + GGUFTypeUint64 + GGUFTypeInt64 + GGUFTypeFloat64 +) -type tensor struct { - name string - kind uint32 - offset uint64 +type KV map[string]any + +type Tensor struct { + Name string + Kind uint32 + Offset uint64 // shape is the number of elements in each dimension - shape [4]uint64 + Shape [4]uint64 + + FileName string + OffsetPadding uint64 + FileOffsets []uint64 } -func (t tensor) blockSize() uint64 { +func (t Tensor) BlockSize() uint64 { switch { - case t.kind < 2: + case t.Kind < 2: return 1 - case t.kind < 10: + case t.Kind < 10: return 32 default: return 256 } } -func (t tensor) typeSize() uint64 { - blockSize := t.blockSize() +func (t Tensor) TypeSize() uint64 { + blockSize := t.BlockSize() - switch t.kind 
{ + switch t.Kind { case 0: // FP32 return 4 case 1: // FP16 @@ -128,31 +155,63 @@ func (t tensor) typeSize() uint64 { } } -func (t tensor) parameters() uint64 { - return t.shape[0] * t.shape[1] * t.shape[2] * t.shape[3] +func (t Tensor) Parameters() uint64 { + return t.Shape[0] * t.Shape[1] * t.Shape[2] * t.Shape[3] } -func (t tensor) size() uint64 { - return t.parameters() * t.typeSize() / t.blockSize() +func (t Tensor) Size() uint64 { + return t.Parameters() * t.TypeSize() / t.BlockSize() } -type ggufModel struct { - *containerGGUF +func (t Tensor) Repack(data []uint16, heads int) ([]uint16, error) { + n := tensor.New(tensor.WithShape(int(t.Shape[0]), int(t.Shape[1])), tensor.WithBacking(data)) + origShape := n.Shape().Clone() - kv - tensors []tensor + // reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf + if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil { + return []uint16{}, err + } + + if err := n.T(0, 2, 1, 3); err != nil { + return []uint16{}, err + } + + if err := n.Reshape(origShape...); err != nil { + return []uint16{}, err + } + + if err := n.Transpose(); err != nil { + return []uint16{}, err + } + newN, err := native.SelectU16(n, 1) + if err != nil { + return []uint16{}, err + } + + var fullTensor []uint16 + for _, v := range newN { + fullTensor = append(fullTensor, v...) 
+ } + return fullTensor, nil +} + +type GGUFModel struct { + *ContainerGGUF + + KV + Tensors []Tensor parameters uint64 } -func newGGUFModel(container *containerGGUF) *ggufModel { - return &ggufModel{ - containerGGUF: container, - kv: make(kv), +func NewGGUFModel(container *ContainerGGUF) *GGUFModel { + return &GGUFModel{ + ContainerGGUF: container, + KV: make(KV), } } -func (llm *ggufModel) NumTensor() uint64 { +func (llm *GGUFModel) NumTensor() uint64 { if llm.Version == 1 { return uint64(llm.V1.NumTensor) } @@ -160,7 +219,7 @@ func (llm *ggufModel) NumTensor() uint64 { return llm.V2.NumTensor } -func (llm *ggufModel) NumKV() uint64 { +func (llm *GGUFModel) NumKV() uint64 { if llm.Version == 1 { return uint64(llm.V1.NumKV) } @@ -168,15 +227,15 @@ func (llm *ggufModel) NumKV() uint64 { return llm.V2.NumKV } -func (llm *ggufModel) ModelFamily() string { - if t, ok := llm.kv["general.architecture"].(string); ok { +func (llm *GGUFModel) ModelFamily() string { + if t, ok := llm.KV["general.architecture"].(string); ok { return t } return "unknown" } -func (llm *ggufModel) ModelType() string { +func (llm *GGUFModel) ModelType() string { if llm.parameters > 0 { return format.HumanNumber(llm.parameters) } @@ -184,15 +243,393 @@ func (llm *ggufModel) ModelType() string { return "unknown" } -func (llm *ggufModel) FileType() string { - if t, ok := llm.kv["general.file_type"].(uint32); ok { +func (llm *GGUFModel) FileType() string { + if t, ok := llm.KV["general.file_type"].(uint32); ok { return fileType(t) } return "unknown" } -func (llm *ggufModel) Decode(rso *readSeekOffset) error { +func (llm *GGUFModel) Encode(f *os.File) error { + // this mimics the order of the llama.cpp convert script + kOrder := []string{ + "general.architecture", + "general.name", + "llama.context_length", + "llama.embedding_length", + "llama.block_count", + "llama.feed_forward_length", + "llama.rope.dimension_count", + "llama.attention.head_count", + "llama.attention.head_count_kv", + 
"llama.attention.layer_norm_rms_epsilon", + "llama.rope.freq_base", + "general.file_type", + "tokenizer.ggml.model", + "tokenizer.ggml.tokens", + "tokenizer.ggml.scores", + "tokenizer.ggml.token_type", + "tokenizer.ggml.bos_token_id", + "tokenizer.ggml.eos_token_id", + "tokenizer.ggml.unknown_token_id", + "tokenizer.ggml.add_bos_token", + "tokenizer.ggml.add_eos_token", + "tokenizer.chat_template", + } + + if err := binary.Write(f, llm.ByteOrder, []byte("GGUF")); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, uint32(3)); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, uint64(llm.V3.NumTensor)); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, uint64(llm.V3.NumKV)); err != nil { + return err + } + + for _, k := range kOrder { + val, ok := llm.KV[k] + if !ok { + continue + } + + if err := binary.Write(f, llm.ByteOrder, uint64(len(k))); err != nil { + return err + } + if err := binary.Write(f, llm.ByteOrder, []byte(k)); err != nil { + return err + } + + switch v := val.(type) { + case uint32: + if err := binary.Write(f, llm.ByteOrder, GGUFTypeUint32); err != nil { + return err + } + + if err := llm.writeUint32(f, v); err != nil { + return err + } + case float32: + if err := binary.Write(f, llm.ByteOrder, GGUFTypeFloat32); err != nil { + return err + } + + if err := llm.writeF32(f, v); err != nil { + return err + } + case bool: + if err := binary.Write(f, llm.ByteOrder, GGUFTypeBool); err != nil { + return err + } + + if err := llm.writeBool(f, v); err != nil { + return err + } + case string: + if err := binary.Write(f, llm.ByteOrder, GGUFTypeString); err != nil { + return err + } + + if err := llm.writeString(f, v); err != nil { + return err + } + case []int32: + if err := binary.Write(f, llm.ByteOrder, GGUFTypeArray); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, GGUFTypeInt32); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, 
uint64(len(v))); err != nil { + return err + } + for _, i := range v { + if err := llm.writeInt32(f, i); err != nil { + return err + } + } + case []uint32: + if err := binary.Write(f, llm.ByteOrder, GGUFTypeArray); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, GGUFTypeUint32); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, uint64(len(v))); err != nil { + return err + } + for _, i := range v { + if err := llm.writeUint32(f, i); err != nil { + return err + } + } + case []float32: + if err := binary.Write(f, llm.ByteOrder, GGUFTypeArray); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, GGUFTypeFloat32); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, uint64(len(v))); err != nil { + return err + } + for _, fl := range v { + if err := llm.writeF32(f, fl); err != nil { + return err + } + } + case []string: + if err := binary.Write(f, llm.ByteOrder, GGUFTypeArray); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, GGUFTypeString); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, uint64(len(v))); err != nil { + return err + } + + for _, s := range v { + if err := llm.writeString(f, s); err != nil { + return err + } + } + } + } + + // write layer metadata + for _, t := range llm.Tensors { + if err := llm.writeString(f, t.Name); err != nil { + return err + } + + // the dimensions of the tensor + dims := 1 + if t.Shape[1] > 0 { + dims = 2 + } + + if err := binary.Write(f, llm.ByteOrder, uint32(dims)); err != nil { + return err + } + + for i := 0; i < dims; i++ { + if err := binary.Write(f, llm.ByteOrder, uint64(t.Shape[dims-1-i])); err != nil { + return err + } + } + + if err := binary.Write(f, llm.ByteOrder, uint32(t.Kind)); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, uint64(t.Offset)); err != nil { + return err + } + } + + offset, terr := f.Seek(0, io.SeekCurrent) + if terr != nil { + 
return terr + } + slog.Debug(fmt.Sprintf("tensors offset = %x", offset)) + + if err := llm.writePadding(f, 32); err != nil { + return err + } + + var dataFile *os.File + var currentFile string + var err error + for _, t := range llm.Tensors { + if currentFile != t.FileName { + if f != nil { + dataFile.Close() + } + currentFile = t.FileName + dataFile, err = os.Open(t.FileName) + if err != nil { + fmt.Println(err) + return err + } + } + + dataFile.Seek(int64(t.OffsetPadding+t.FileOffsets[0]), 0) + + pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$` + re, err := regexp.Compile(pattern) + if err != nil { + return err + } + + matches := re.FindAllStringSubmatch(t.Name, -1) + if len(matches) > 0 { + layerSize := t.FileOffsets[1] - t.FileOffsets[0] + + var err error + tData := make([]uint16, layerSize/2) + if err = binary.Read(dataFile, llm.ByteOrder, tData); err != nil { + return err + } + + layerType := matches[0][re.SubexpIndex("layer")] + var heads uint32 + switch layerType { + case "q": + heads = llm.KV["llama.attention.head_count"].(uint32) + case "k": + heads = llm.KV["llama.attention.head_count_kv"].(uint32) + if heads == 0 { + heads = llm.KV["llama.attention.head_count"].(uint32) + } + } + + tData, err = t.Repack(tData, int(heads)) + if err != nil { + return err + } + + var buf []byte + for _, n := range tData { + buf = binary.LittleEndian.AppendUint16(buf, n) + } + + tempBuf := make([]uint16, len(tData)) + tDataF32 := bfloat16.DecodeFloat32(buf) + for cnt, v := range tDataF32 { + tDataF16 := float16.Fromfloat32(v) + tempBuf[cnt] = uint16(tDataF16) + } + + if err = binary.Write(f, llm.ByteOrder, tempBuf); err != nil { + return err + } + + if err := llm.writePadding(f, 32); err != nil { + return err + } + continue + } + + remaining := t.FileOffsets[1] - t.FileOffsets[0] + + bufSize := uint64(10240) + var finished bool + for { + data := make([]byte, min(bufSize, remaining)) + + b, err := io.ReadFull(dataFile, data) + remaining -= uint64(b) + + if err == io.EOF ||
remaining <= 0 { + finished = true + } else if err != nil { + return err + } + + // convert bfloat16 -> ieee float32 + tDataF32 := bfloat16.DecodeFloat32(data) + + switch t.Kind { + case 0: + if err := binary.Write(f, llm.ByteOrder, tDataF32); err != nil { + return err + } + case 1: + // convert float32 -> float16 + tempBuf := make([]uint16, len(data)/2) + for cnt, v := range tDataF32 { + tDataF16 := float16.Fromfloat32(v) + tempBuf[cnt] = uint16(tDataF16) + } + if err := binary.Write(f, llm.ByteOrder, tempBuf); err != nil { + return err + } + } + if finished { + break + } + } + + if err := llm.writePadding(f, 32); err != nil { + return err + } + } + f.Close() + + return nil +} + +func (llm *GGUFModel) writePadding(f *os.File, align int64) error { + // gguf file padding is defined in https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure + offset, err := f.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + padding := ((offset + align - 1) / align) * align + buf := make([]byte, padding-offset) + if err := binary.Write(f, llm.ByteOrder, buf); err != nil { + return err + } + + return nil +} + +func (llm *GGUFModel) writeInt32(f *os.File, v int32) error { + if err := binary.Write(f, llm.ByteOrder, v); err != nil { + return err + } + return nil +} + +func (llm *GGUFModel) writeUint32(f *os.File, v uint32) error { + if err := binary.Write(f, llm.ByteOrder, v); err != nil { + return err + } + return nil +} + +func (llm *GGUFModel) writeF32(f *os.File, v float32) error { + if err := binary.Write(f, llm.ByteOrder, v); err != nil { + return err + } + return nil +} + +func (llm *GGUFModel) writeBool(f *os.File, b bool) error { + if err := binary.Write(f, llm.ByteOrder, b); err != nil { + return err + } + return nil +} + +func (llm *GGUFModel) writeString(f *os.File, s string) error { + if err := binary.Write(f, llm.ByteOrder, uint64(len(s))); err != nil { + return err + } + + if err := binary.Write(f, llm.ByteOrder, []byte(s)); err != nil { + 
return err + } + return nil +} + +func (llm *GGUFModel) Decode(rso *readSeekOffset) error { // decode key-values for i := 0; uint64(i) < llm.NumKV(); i++ { k, err := llm.readString(rso) @@ -204,36 +641,36 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error { var v any switch vtype { - case ggufTypeUint8: + case GGUFTypeUint8: v = llm.readU8(rso) - case ggufTypeInt8: + case GGUFTypeInt8: v = llm.readI8(rso) - case ggufTypeUint16: + case GGUFTypeUint16: v = llm.readU16(rso) - case ggufTypeInt16: + case GGUFTypeInt16: v = llm.readI16(rso) - case ggufTypeUint32: + case GGUFTypeUint32: v = llm.readU32(rso) - case ggufTypeInt32: + case GGUFTypeInt32: v = llm.readI32(rso) - case ggufTypeUint64: + case GGUFTypeUint64: v = llm.readU64(rso) - case ggufTypeInt64: + case GGUFTypeInt64: v = llm.readI64(rso) - case ggufTypeFloat32: + case GGUFTypeFloat32: v = llm.readF32(rso) - case ggufTypeFloat64: + case GGUFTypeFloat64: v = llm.readF64(rso) - case ggufTypeBool: + case GGUFTypeBool: v = llm.readBool(rso) - case ggufTypeString: + case GGUFTypeString: s, err := llm.readString(rso) if err != nil { return err } v = s - case ggufTypeArray: + case GGUFTypeArray: a, err := llm.readArray(rso) if err != nil { return err @@ -244,7 +681,7 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error { return fmt.Errorf("invalid type: %d", vtype) } - llm.kv[k] = v + llm.KV[k] = v } // decode tensors @@ -262,33 +699,33 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error { shape[i] = llm.readU64(rso) } - tensor := tensor{ - name: name, - kind: llm.readU32(rso), - offset: llm.readU64(rso), - shape: shape, + tensor := Tensor{ + Name: name, + Kind: llm.readU32(rso), + Offset: llm.readU64(rso), + Shape: shape, } - llm.tensors = append(llm.tensors, tensor) - llm.parameters += tensor.parameters() + llm.Tensors = append(llm.Tensors, tensor) + llm.parameters += tensor.Parameters() } - alignment, ok := llm.kv["general.alignment"].(uint32) + alignment, ok := 
llm.KV["general.alignment"].(uint32) if !ok { alignment = 32 } rso.Seek(int64(alignment)-rso.offset%int64(alignment), io.SeekCurrent) - for _, tensor := range llm.tensors { - padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1) + for _, tensor := range llm.Tensors { + padded := (int64(tensor.Size()) + int64(alignment) - 1) & ^(int64(alignment) - 1) rso.Seek(padded, io.SeekCurrent) } return nil } -func (llm *ggufModel) NumLayers() uint32 { - value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())] +func (llm *GGUFModel) NumLayers() uint32 { + value, exists := llm.KV[fmt.Sprintf("%s.block_count", llm.ModelFamily())] if !exists { return 0 } @@ -296,8 +733,8 @@ func (llm *ggufModel) NumLayers() uint32 { return value.(uint32) } -func (llm *ggufModel) NumHead() uint32 { - value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())] +func (llm *GGUFModel) NumHead() uint32 { + value, exists := llm.KV[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())] if !exists { return 0 } @@ -305,8 +742,8 @@ func (llm *ggufModel) NumHead() uint32 { return value.(uint32) } -func (llm *ggufModel) NumEmbed() uint32 { - value, exists := llm.kv[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())] +func (llm *GGUFModel) NumEmbed() uint32 { + value, exists := llm.KV[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())] if !exists { return 0 } @@ -314,8 +751,8 @@ func (llm *ggufModel) NumEmbed() uint32 { return value.(uint32) } -func (llm *ggufModel) NumHeadKv() uint32 { - value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())] +func (llm *GGUFModel) NumHeadKv() uint32 { + value, exists := llm.KV[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())] if !exists { return 0 } @@ -323,8 +760,8 @@ func (llm *ggufModel) NumHeadKv() uint32 { return value.(uint32) } -func (llm *ggufModel) NumCtx() uint32 { - value, exists := llm.kv[fmt.Sprintf("%s.context_length", llm.ModelFamily())] 
+func (llm *GGUFModel) NumCtx() uint32 { + value, exists := llm.KV[fmt.Sprintf("%s.context_length", llm.ModelFamily())] if !exists { return 0 } @@ -332,7 +769,7 @@ func (llm *ggufModel) NumCtx() uint32 { return value.(uint32) } -func (llm *ggufModel) NumGQA() uint32 { +func (llm *GGUFModel) NumGQA() uint32 { numHeadKv := llm.NumHeadKv() if numHeadKv == 0 { return 0 @@ -341,75 +778,75 @@ func (llm *ggufModel) NumGQA() uint32 { return llm.NumHead() / numHeadKv } -func (llm ggufModel) readU8(r io.Reader) uint8 { +func (llm GGUFModel) readU8(r io.Reader) uint8 { var u8 uint8 - binary.Read(r, llm.bo, &u8) + binary.Read(r, llm.ByteOrder, &u8) return u8 } -func (llm ggufModel) readI8(r io.Reader) int8 { +func (llm GGUFModel) readI8(r io.Reader) int8 { var i8 int8 - binary.Read(r, llm.bo, &i8) + binary.Read(r, llm.ByteOrder, &i8) return i8 } -func (llm ggufModel) readU16(r io.Reader) uint16 { +func (llm GGUFModel) readU16(r io.Reader) uint16 { var u16 uint16 - binary.Read(r, llm.bo, &u16) + binary.Read(r, llm.ByteOrder, &u16) return u16 } -func (llm ggufModel) readI16(r io.Reader) int16 { +func (llm GGUFModel) readI16(r io.Reader) int16 { var i16 int16 - binary.Read(r, llm.bo, &i16) + binary.Read(r, llm.ByteOrder, &i16) return i16 } -func (llm ggufModel) readU32(r io.Reader) uint32 { +func (llm GGUFModel) readU32(r io.Reader) uint32 { var u32 uint32 - binary.Read(r, llm.bo, &u32) + binary.Read(r, llm.ByteOrder, &u32) return u32 } -func (llm ggufModel) readI32(r io.Reader) int32 { +func (llm GGUFModel) readI32(r io.Reader) int32 { var i32 int32 - binary.Read(r, llm.bo, &i32) + binary.Read(r, llm.ByteOrder, &i32) return i32 } -func (llm ggufModel) readU64(r io.Reader) uint64 { +func (llm GGUFModel) readU64(r io.Reader) uint64 { var u64 uint64 - binary.Read(r, llm.bo, &u64) + binary.Read(r, llm.ByteOrder, &u64) return u64 } -func (llm ggufModel) readI64(r io.Reader) int64 { +func (llm GGUFModel) readI64(r io.Reader) int64 { var i64 int64 - binary.Read(r, llm.bo, &i64) + 
binary.Read(r, llm.ByteOrder, &i64) return i64 } -func (llm ggufModel) readF32(r io.Reader) float32 { +func (llm GGUFModel) readF32(r io.Reader) float32 { var f32 float32 - binary.Read(r, llm.bo, &f32) + binary.Read(r, llm.ByteOrder, &f32) return f32 } -func (llm ggufModel) readF64(r io.Reader) float64 { +func (llm GGUFModel) readF64(r io.Reader) float64 { var f64 float64 - binary.Read(r, llm.bo, &f64) + binary.Read(r, llm.ByteOrder, &f64) return f64 } -func (llm ggufModel) readBool(r io.Reader) bool { +func (llm GGUFModel) readBool(r io.Reader) bool { var b bool - binary.Read(r, llm.bo, &b) + binary.Read(r, llm.ByteOrder, &b) return b } -func (llm ggufModel) readStringV1(r io.Reader) (string, error) { +func (llm GGUFModel) readStringV1(r io.Reader) (string, error) { var nameLength uint32 - binary.Read(r, llm.bo, &nameLength) + binary.Read(r, llm.ByteOrder, &nameLength) var b bytes.Buffer if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil { @@ -422,13 +859,13 @@ func (llm ggufModel) readStringV1(r io.Reader) (string, error) { return b.String(), nil } -func (llm ggufModel) readString(r io.Reader) (string, error) { +func (llm GGUFModel) readString(r io.Reader) (string, error) { if llm.Version == 1 { return llm.readStringV1(r) } var nameLength uint64 - binary.Read(r, llm.bo, &nameLength) + binary.Read(r, llm.ByteOrder, &nameLength) var b bytes.Buffer if _, err := io.CopyN(&b, r, int64(nameLength)); err != nil { @@ -438,29 +875,29 @@ func (llm ggufModel) readString(r io.Reader) (string, error) { return b.String(), nil } -func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) { +func (llm *GGUFModel) readArrayV1(r io.Reader) (arr []any, err error) { atype := llm.readU32(r) n := llm.readU32(r) for i := 0; uint32(i) < n; i++ { switch atype { - case ggufTypeUint8: + case GGUFTypeUint8: arr = append(arr, llm.readU8(r)) - case ggufTypeInt8: + case GGUFTypeInt8: arr = append(arr, llm.readI8(r)) - case ggufTypeUint16: + case GGUFTypeUint16: arr = 
append(arr, llm.readU16(r)) - case ggufTypeInt16: + case GGUFTypeInt16: arr = append(arr, llm.readI16(r)) - case ggufTypeUint32: + case GGUFTypeUint32: arr = append(arr, llm.readU32(r)) - case ggufTypeInt32: + case GGUFTypeInt32: arr = append(arr, llm.readI32(r)) - case ggufTypeFloat32: + case GGUFTypeFloat32: arr = append(arr, llm.readF32(r)) - case ggufTypeBool: + case GGUFTypeBool: arr = append(arr, llm.readBool(r)) - case ggufTypeString: + case GGUFTypeString: s, err := llm.readStringV1(r) if err != nil { return nil, err @@ -475,7 +912,7 @@ func (llm *ggufModel) readArrayV1(r io.Reader) (arr []any, err error) { return } -func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) { +func (llm *GGUFModel) readArray(r io.Reader) (arr []any, err error) { if llm.Version == 1 { return llm.readArrayV1(r) } @@ -485,29 +922,29 @@ func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) { for i := 0; uint64(i) < n; i++ { switch atype { - case ggufTypeUint8: + case GGUFTypeUint8: arr = append(arr, llm.readU8(r)) - case ggufTypeInt8: + case GGUFTypeInt8: arr = append(arr, llm.readI8(r)) - case ggufTypeUint16: + case GGUFTypeUint16: arr = append(arr, llm.readU16(r)) - case ggufTypeInt16: + case GGUFTypeInt16: arr = append(arr, llm.readI16(r)) - case ggufTypeUint32: + case GGUFTypeUint32: arr = append(arr, llm.readU32(r)) - case ggufTypeInt32: + case GGUFTypeInt32: arr = append(arr, llm.readI32(r)) - case ggufTypeUint64: + case GGUFTypeUint64: arr = append(arr, llm.readU64(r)) - case ggufTypeInt64: + case GGUFTypeInt64: arr = append(arr, llm.readI64(r)) - case ggufTypeFloat32: + case GGUFTypeFloat32: arr = append(arr, llm.readF32(r)) - case ggufTypeFloat64: + case GGUFTypeFloat64: arr = append(arr, llm.readF64(r)) - case ggufTypeBool: + case GGUFTypeBool: arr = append(arr, llm.readBool(r)) - case ggufTypeString: + case GGUFTypeString: s, err := llm.readString(r) if err != nil { return nil, err diff --git a/server/images.go b/server/images.go index 
83ef7920..abd8d780 100644 --- a/server/images.go +++ b/server/images.go @@ -1,6 +1,7 @@ package server import ( + "archive/zip" "bytes" "context" "crypto/sha256" @@ -23,6 +24,7 @@ import ( "golang.org/x/exp/slices" "github.com/jmorganca/ollama/api" + "github.com/jmorganca/ollama/convert" "github.com/jmorganca/ollama/llm" "github.com/jmorganca/ollama/parser" "github.com/jmorganca/ollama/version" @@ -316,7 +318,24 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars c.Args = blobPath } - bin, err := os.Open(realpath(modelFileDir, c.Args)) + pathName := realpath(modelFileDir, c.Args) + + ggufName, err := convertSafetensors(name, pathName) + if err != nil { + switch { + case errors.Is(err, zip.ErrFormat): + // it's not a safetensor archive + default: + return err + } + } + + if ggufName != "" { + pathName = ggufName + defer os.RemoveAll(ggufName) + } + + bin, err := os.Open(pathName) if err != nil { // not a file on disk so must be a model reference modelpath := ParseModelPath(c.Args) @@ -592,6 +611,73 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars return nil } +func convertSafetensors(name, fn string) (string, error) { + r, err := zip.OpenReader(fn) + if err != nil { + return "", err + } + defer r.Close() + + tempDir, err := os.MkdirTemp("", "ollama-convert") + if err != nil { + return "", err + } + defer os.RemoveAll(tempDir) + + for _, f := range r.File { + fpath := filepath.Join(tempDir, f.Name) + outFile, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, f.Mode()) + if err != nil { + return "", err + } + + rc, err := f.Open() + if err != nil { + return "", err + } + + _, err = io.Copy(outFile, rc) + if err != nil { + return "", err + } + + outFile.Close() + rc.Close() + } + + params, err := convert.GetParams(tempDir) + if err != nil { + return "", err + } + + SupportedArchs := []string{ + "MistralForCausalLM", + } + + for _, arch := range params.Architectures { + if 
!slices.Contains(SupportedArchs, arch) { + return "", fmt.Errorf("this safetensors model is not yet supported") + } + } + + t, err := convert.GetSafeTensors(tempDir) + if err != nil { + return "", err + } + + vocab, err := convert.LoadTokens(tempDir) + if err != nil { + return "", err + } + + fn, err = convert.WriteGGUF(name, t, params, vocab) + if err != nil { + return "", err + } + + return fn, nil +} + func CopyModel(src, dest string) error { srcModelPath := ParseModelPath(src) srcPath, err := srcModelPath.GetManifestPath()