From da74384a3e6502a0d263c55d34a721970afccafc Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Thu, 6 Jul 2023 17:49:05 -0400
Subject: [PATCH] remove prompt cache

---
 llama/binding/binding.cpp | 11 +++--------
 llama/binding/binding.h   |  5 ++---
 llama/llama.go            | 14 ++++----------
 llama/options.go          | 23 +++--------------------
 4 files changed, 12 insertions(+), 41 deletions(-)

diff --git a/llama/binding/binding.cpp b/llama/binding/binding.cpp
index 52acc9ea..314e5ac1 100644
--- a/llama/binding/binding.cpp
+++ b/llama/binding/binding.cpp
@@ -24,7 +24,7 @@
 #include
 #endif
 
-#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || \
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || \
     defined(_WIN32)
 void sigint_handler(int signo) {
   if (signo == SIGINT) {
@@ -573,15 +573,13 @@ void *llama_allocate_params(
     const char **antiprompt, int antiprompt_count, float tfs_z,
     float typical_p, float frequency_penalty, float presence_penalty,
     int mirostat, float mirostat_eta, float mirostat_tau, bool penalize_nl,
-    const char *logit_bias, const char *session_file, bool prompt_cache_all,
-    bool mlock, bool mmap, const char *maingpu, const char *tensorsplit,
-    bool prompt_cache_ro) {
+    const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
+    const char *tensorsplit) {
   gpt_params *params = new gpt_params;
   params->seed = seed;
   params->n_threads = threads;
   params->n_predict = tokens;
   params->repeat_last_n = repeat_last_n;
-  params->prompt_cache_ro = prompt_cache_ro;
   params->top_k = top_k;
   params->top_p = top_p;
   params->memory_f16 = memory_f16;
@@ -612,9 +610,6 @@
     }
   }
 
-  params->prompt_cache_all = prompt_cache_all;
-  params->path_prompt_cache = session_file;
-
   if (ignore_eos) {
     params->logit_bias[llama_token_eos()] = -INFINITY;
   }
diff --git a/llama/binding/binding.h b/llama/binding/binding.h
index eb04d00c..392d53ab 100644
--- a/llama/binding/binding.h
+++ b/llama/binding/binding.h
@@ -31,9 +31,8 @@ void *llama_allocate_params(
     const char **antiprompt, int antiprompt_count, float tfs_z,
     float typical_p, float frequency_penalty, float presence_penalty,
     int mirostat, float mirostat_eta, float mirostat_tau, bool penalize_nl,
-    const char *logit_bias, const char *session_file, bool prompt_cache_all,
-    bool mlock, bool mmap, const char *maingpu, const char *tensorsplit,
-    bool prompt_cache_ro);
+    const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
+    const char *tensorsplit);
 
 void llama_free_params(void *params_ptr);
 
diff --git a/llama/llama.go b/llama/llama.go
index e5804e1f..073bae94 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -28,6 +28,7 @@ package llama
 // #include "binding/binding.h"
 // #include
 import "C"
+
 import (
 	"fmt"
 	"strings"
@@ -69,7 +70,7 @@ func (l *LLama) Eval(text string, opts ...PredictOption) error {
 		po.Tokens = 99999999
 	}
 	defer C.free(unsafe.Pointer(input))
-	
+
 	reverseCount := len(po.StopPrompts)
 	reversePrompt := make([]*C.char, reverseCount)
 	var pass **C.char
@@ -86,9 +87,7 @@ func (l *LLama) Eval(text string, opts ...PredictOption) error {
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount), C.float(po.TailFreeSamplingZ), C.float(po.TypicalP),
 		C.float(po.FrequencyPenalty), C.float(po.PresencePenalty), C.int(po.Mirostat), C.float(po.MirostatETA),
 		C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias),
-		C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
-		C.CString(po.MainGPU), C.CString(po.TensorSplit),
-		C.bool(po.PromptCacheRO),
+		C.bool(po.MLock), C.bool(po.MMap), C.CString(po.MainGPU), C.CString(po.TensorSplit),
 	)
 	defer C.llama_free_params(params)
 
@@ -128,9 +127,6 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
 	cLogitBias := C.CString(po.LogitBias)
 	defer C.free(unsafe.Pointer(cLogitBias))
 
-	cPathPromptCache := C.CString(po.PathPromptCache)
-	defer C.free(unsafe.Pointer(cPathPromptCache))
-
 	cMainGPU := C.CString(po.MainGPU)
 	defer C.free(unsafe.Pointer(cMainGPU))
 
@@ -143,9 +139,7 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount), C.float(po.TailFreeSamplingZ), C.float(po.TypicalP),
 		C.float(po.FrequencyPenalty), C.float(po.PresencePenalty), C.int(po.Mirostat), C.float(po.MirostatETA),
 		C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cLogitBias,
-		cPathPromptCache, C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
-		cMainGPU, cTensorSplit,
-		C.bool(po.PromptCacheRO),
+		C.bool(po.MLock), C.bool(po.MMap), cMainGPU, cTensorSplit,
 	)
 	defer C.llama_free_params(params)
 
diff --git a/llama/options.go b/llama/options.go
index 3cc72a53..ca589cc3 100644
--- a/llama/options.go
+++ b/llama/options.go
@@ -57,11 +57,9 @@ type PredictOptions struct {
 	LogitBias     string
 	TokenCallback func(string) bool
 
-	PathPromptCache             string
-	MLock, MMap, PromptCacheAll bool
-	PromptCacheRO               bool
-	MainGPU                     string
-	TensorSplit                 string
+	MLock, MMap bool
+	MainGPU     string
+	TensorSplit string
 }
 
 type PredictOption func(p *PredictOptions)
@@ -182,14 +180,6 @@ var Debug PredictOption = func(p *PredictOptions) {
 	p.DebugMode = true
 }
 
-var EnablePromptCacheAll PredictOption = func(p *PredictOptions) {
-	p.PromptCacheAll = true
-}
-
-var EnablePromptCacheRO PredictOption = func(p *PredictOptions) {
-	p.PromptCacheRO = true
-}
-
 var EnableMLock ModelOption = func(p *ModelOptions) {
 	p.MLock = true
 }
@@ -284,13 +274,6 @@ func SetTemperature(temp float64) PredictOption {
 	}
 }
 
-// SetPathPromptCache sets the session file to store the prompt cache.
-func SetPathPromptCache(f string) PredictOption {
-	return func(p *PredictOptions) {
-		p.PathPromptCache = f
-	}
-}
-
 // SetPenalty sets the repetition penalty for text generation.
 func SetPenalty(penalty float64) PredictOption {
 	return func(p *PredictOptions) {