From da74384a3e6502a0d263c55d34a721970afccafc Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Thu, 6 Jul 2023 17:49:05 -0400
Subject: [PATCH] remove prompt cache

---
 llama/binding/binding.cpp | 11 +++--------
 llama/binding/binding.h   |  5 ++---
 llama/llama.go            | 14 ++++----------
 llama/options.go          | 23 +++--------------------
 4 files changed, 12 insertions(+), 41 deletions(-)

diff --git a/llama/binding/binding.cpp b/llama/binding/binding.cpp
index 52acc9ea..314e5ac1 100644
--- a/llama/binding/binding.cpp
+++ b/llama/binding/binding.cpp
@@ -24,7 +24,7 @@
 #include
 #endif
 
-#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || \
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || \
     defined(_WIN32)
 void sigint_handler(int signo) {
   if (signo == SIGINT) {
@@ -573,15 +573,13 @@ void *llama_allocate_params(
     const char **antiprompt, int antiprompt_count, float tfs_z,
     float typical_p, float frequency_penalty, float presence_penalty,
     int mirostat, float mirostat_eta, float mirostat_tau, bool penalize_nl,
-    const char *logit_bias, const char *session_file, bool prompt_cache_all,
-    bool mlock, bool mmap, const char *maingpu, const char *tensorsplit,
-    bool prompt_cache_ro) {
+    const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
+    const char *tensorsplit) {
   gpt_params *params = new gpt_params;
   params->seed = seed;
   params->n_threads = threads;
   params->n_predict = tokens;
   params->repeat_last_n = repeat_last_n;
-  params->prompt_cache_ro = prompt_cache_ro;
   params->top_k = top_k;
   params->top_p = top_p;
   params->memory_f16 = memory_f16;
@@ -612,9 +610,6 @@
     }
   }
 
-  params->prompt_cache_all = prompt_cache_all;
-  params->path_prompt_cache = session_file;
-
   if (ignore_eos) {
     params->logit_bias[llama_token_eos()] = -INFINITY;
   }
diff --git a/llama/binding/binding.h b/llama/binding/binding.h
index eb04d00c..392d53ab 100644
--- a/llama/binding/binding.h
+++ b/llama/binding/binding.h
@@ -31,9 +31,8 @@ void *llama_allocate_params(
     const char **antiprompt, int antiprompt_count, float tfs_z,
     float typical_p, float frequency_penalty, float presence_penalty,
     int mirostat, float mirostat_eta, float mirostat_tau, bool penalize_nl,
-    const char *logit_bias, const char *session_file, bool prompt_cache_all,
-    bool mlock, bool mmap, const char *maingpu, const char *tensorsplit,
-    bool prompt_cache_ro);
+    const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
+    const char *tensorsplit);
 
 void llama_free_params(void *params_ptr);
 
diff --git a/llama/llama.go b/llama/llama.go
index e5804e1f..073bae94 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -28,6 +28,7 @@ package llama
 // #include "binding/binding.h"
 // #include
 import "C"
+
 import (
 	"fmt"
 	"strings"
@@ -69,7 +70,7 @@ func (l *LLama) Eval(text string, opts ...PredictOption) error {
 		po.Tokens = 99999999
 	}
 	defer C.free(unsafe.Pointer(input))
-	
+
 	reverseCount := len(po.StopPrompts)
 	reversePrompt := make([]*C.char, reverseCount)
 	var pass **C.char
@@ -86,9 +87,7 @@ func (l *LLama) Eval(text string, opts ...PredictOption) error {
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount), C.float(po.TailFreeSamplingZ), C.float(po.TypicalP),
 		C.float(po.FrequencyPenalty), C.float(po.PresencePenalty), C.int(po.Mirostat), C.float(po.MirostatETA),
 		C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias),
-		C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
-		C.CString(po.MainGPU), C.CString(po.TensorSplit),
-		C.bool(po.PromptCacheRO),
+		C.bool(po.MLock), C.bool(po.MMap), C.CString(po.MainGPU), C.CString(po.TensorSplit),
 	)
 	defer C.llama_free_params(params)
 
@@ -128,9 +127,6 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
 	cLogitBias := C.CString(po.LogitBias)
 	defer C.free(unsafe.Pointer(cLogitBias))
 
-	cPathPromptCache := C.CString(po.PathPromptCache)
-	defer C.free(unsafe.Pointer(cPathPromptCache))
-
 	cMainGPU := C.CString(po.MainGPU)
 	defer C.free(unsafe.Pointer(cMainGPU))
 
@@ -143,9 +139,7 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount), C.float(po.TailFreeSamplingZ), C.float(po.TypicalP),
 		C.float(po.FrequencyPenalty), C.float(po.PresencePenalty), C.int(po.Mirostat), C.float(po.MirostatETA),
 		C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cLogitBias,
-		cPathPromptCache, C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
-		cMainGPU, cTensorSplit,
-		C.bool(po.PromptCacheRO),
+		C.bool(po.MLock), C.bool(po.MMap), cMainGPU, cTensorSplit,
 	)
 	defer C.llama_free_params(params)
 
diff --git a/llama/options.go b/llama/options.go
index 3cc72a53..ca589cc3 100644
--- a/llama/options.go
+++ b/llama/options.go
@@ -57,11 +57,9 @@ type PredictOptions struct {
 	LogitBias     string
 	TokenCallback func(string) bool
 
-	PathPromptCache             string
-	MLock, MMap, PromptCacheAll bool
-	PromptCacheRO               bool
-	MainGPU                     string
-	TensorSplit                 string
+	MLock, MMap bool
+	MainGPU     string
+	TensorSplit string
 }
 
 type PredictOption func(p *PredictOptions)
@@ -182,14 +180,6 @@ var Debug PredictOption = func(p *PredictOptions) {
 	p.DebugMode = true
 }
 
-var EnablePromptCacheAll PredictOption = func(p *PredictOptions) {
-	p.PromptCacheAll = true
-}
-
-var EnablePromptCacheRO PredictOption = func(p *PredictOptions) {
-	p.PromptCacheRO = true
-}
-
 var EnableMLock ModelOption = func(p *ModelOptions) {
 	p.MLock = true
 }
@@ -284,13 +274,6 @@ func SetTemperature(temp float64) PredictOption {
 	}
 }
 
-// SetPathPromptCache sets the session file to store the prompt cache.
-func SetPathPromptCache(f string) PredictOption {
-	return func(p *PredictOptions) {
-		p.PathPromptCache = f
-	}
-}
-
 // SetPenalty sets the repetition penalty for text generation.
 func SetPenalty(penalty float64) PredictOption {
 	return func(p *PredictOptions) {