diff --git a/Dockerfile b/Dockerfile index 7c882852..c50665b6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,8 +11,8 @@ RUN mkdir -p /usr/local && tar xz -C /usr/local /etc/apt/keyrings/rocm.gpg && \ + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} focal main" > /etc/apt/sources.list.d/rocm.list && \ + echo "Package: *" > /etc/apt/preferences.d/rocm-pin-600 && \ + echo "Pin: release o=repo.radeon.com" >> /etc/apt/preferences.d/rocm-pin-600 && \ + echo "Pin-Priority: 600" >> /etc/apt/preferences.d/rocm-pin-600 && \ + apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y rocm-hip-libraries rocm-device-libs rocm-libs rocm-ocl-icd rocm-hip-sdk rocm-hip-libraries rocm-cmake rocm-clang-ocl rocm-dev -# centos8 arm64 dependencies -FROM --platform=linux/arm64 nvidia/cuda-arm64:11.3.1-devel-centos8 AS base-arm64 -RUN sed -i -e 's/mirrorlist/#mirrorlist/g' -e 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* -RUN yum install -y git cmake +ENV ROCM_PATH=/opt/rocm + +# Ubuntu 22.04 arm64 dependencies +FROM --platform=linux/arm64 nvidia/cuda:11.7.1-devel-ubuntu22.04 AS base-arm64 +RUN apt-get update && \ + apt-get install -y wget && \ + wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-aarch64.sh" -O /tmp/cmake-installer.sh && \ + chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr FROM base-${TARGETARCH} ARG TARGETARCH ARG GOFLAGS="'-ldflags -w -s'" +ARG CGO_CFLAGS +ARG CLBLAST_VER=1.6.1 + +# Common toolchain +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y gcc-11 g++-11 cpp-11 git ocl-icd-opencl-dev && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 --slave /usr/bin/g++ g++ /usr/bin/g++-11 --slave /usr/bin/gcov gcov /usr/bin/gcov-11 + +# CLBlast +RUN wget -qO- https://github.com/CNugteren/CLBlast/archive/refs/tags/${CLBLAST_VER}.tar.gz | tar zxv -C /tmp/ && \ + cd /tmp/CLBlast-${CLBLAST_VER} && mkdir build && cd build && cmake .. && make && make install # install go ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz @@ -26,6 +51,7 @@ COPY . . ENV GOOS=linux ENV GOARCH=$TARGETARCH ENV GOFLAGS=$GOFLAGS +ENV CGO_CFLAGS=${CGO_CFLAGS} -RUN /usr/local/go/bin/go generate -tags cuda ./... && \ - /usr/local/go/bin/go build -tags cuda . +RUN /usr/local/go/bin/go generate ./... && \ + /usr/local/go/bin/go build . diff --git a/README.md b/README.md index 923290d5..84f94089 100644 --- a/README.md +++ b/README.md @@ -185,8 +185,6 @@ ollama list ## Building -### Generic (CPU) - Install `cmake` and `go`: ``` @@ -202,32 +200,36 @@ Then build the binary: go build . ``` -### CUDA (NVIDIA) +### Linux/Windows CUDA (NVIDIA) *Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!* -Install `cmake` and `golang` as well as [NVIDIA CUDA](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) development and runtime packages. +Note: at present, Ollama is optimized for GPU usage on linux, and requires the CUDA libraries at a minimum to compile even if you do not have an NVIDIA GPU. + +Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages. 
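Before running the generate and build steps below, it can help to confirm that the CUDA toolkit and driver are actually visible to your build environment. A quick, optional sanity check (assuming the standard CUDA command-line tools are on your PATH):

```
nvcc --version   # CUDA toolkit compiler picked up by the generate step
nvidia-smi       # confirms the NVIDIA driver and GPU are visible
```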
Then generate dependencies: ``` -go generate -tags cuda ./... +go generate ./... ``` Then build the binary: ``` -go build -tags cuda . +go build . ``` -### ROCm (AMD) +### Linux ROCm (AMD) *Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!* Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) development packages first, as well as `cmake` and `golang`. Adjust the paths below (correct for Arch) as appropriate for your distribution's install locations and generate dependencies: ``` -CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate -tags rocm ./... +CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./... ``` Then build the binary: ``` -go build -tags rocm +go build . ``` +ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root. + ### Running local builds Next, start the server: diff --git a/gpu/gpu.go b/gpu/gpu.go new file mode 100644 index 00000000..146c711e --- /dev/null +++ b/gpu/gpu.go @@ -0,0 +1,119 @@ +//go:build linux || windows + +package gpu + +/* +#include "gpu_info.h" + +*/ +import "C" +import ( + "fmt" + "log" + "sync" + "unsafe" + + "github.com/jmorganca/ollama/api" +) + +type handles struct { + cuda *C.cuda_handle_t + rocm *C.rocm_handle_t +} + +var gpuMutex sync.Mutex +var gpuHandles *handles = nil + +// Note: gpuMutex must already be held +func initGPUHandles() { + log.Printf("Detecting GPU type") + gpuHandles = &handles{nil, nil} + var resp C.cuda_init_resp_t + C.cuda_init(&resp) + if resp.err != nil { + log.Printf("CUDA not detected: %s", C.GoString(resp.err)) + C.free(unsafe.Pointer(resp.err)) + + var resp C.rocm_init_resp_t + C.rocm_init(&resp) + if resp.err != nil { + log.Printf("ROCm not detected: %s", C.GoString(resp.err)) + C.free(unsafe.Pointer(resp.err)) + } else { + log.Printf("Radeon GPU detected") + rocm := resp.rh + gpuHandles.rocm = &rocm + } + } else { + log.Printf("Nvidia GPU detected") + cuda := resp.ch + gpuHandles.cuda = &cuda + } +} + +func GetGPUInfo() GpuInfo { + // TODO - consider exploring lspci (and equivalent on windows) to check for + // GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries + gpuMutex.Lock() + defer gpuMutex.Unlock() + if gpuHandles == nil { + initGPUHandles() + } + + var memInfo C.mem_info_t + var resp GpuInfo + if gpuHandles.cuda != nil { + C.cuda_check_vram(*gpuHandles.cuda, &memInfo) + resp.Driver = "CUDA" + } else if gpuHandles.rocm != nil { + C.rocm_check_vram(*gpuHandles.rocm, &memInfo) + resp.Driver = "ROCM" + } else { + C.cpu_check_ram(&memInfo) + resp.Driver = "CPU" + } + if memInfo.err != nil { + log.Printf("error looking up GPU memory: %s", C.GoString(memInfo.err)) + C.free(unsafe.Pointer(memInfo.err)) + } + resp.FreeMemory = uint64(memInfo.free) + resp.TotalMemory = uint64(memInfo.total) + return resp +} + +func CheckVRAM() (int64, error) { + gpuInfo := GetGPUInfo() + if gpuInfo.FreeMemory > 0 && gpuInfo.Driver != "CPU" { + return int64(gpuInfo.FreeMemory), nil + } + return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU-based memory determination +} + +func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { + if opts.NumGPU != -1 { + return
opts.NumGPU + } + info := GetGPUInfo() + if info.Driver == "CPU" { + return 0 + } + + /* + Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers. + We can store the model weights and the kv cache in vram, + to enable kv cache vram storage add two additional layers to the number of layers retrieved from the model file. + */ + bytesPerLayer := uint64(fileSizeBytes / numLayer) + + // 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors + layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4 + + // TODO - not sure on this part... if we can't fit all the layers, just fallback to CPU + // if int64(layers) < numLayer { + // log.Printf("%d MB VRAM available, insufficient to load current model (requires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024)) + // return 0 + // } + log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", info.FreeMemory/(1024*1024), layers, numLayer) + + return layers +} diff --git a/llm/gpu_darwin.go b/gpu/gpu_darwin.go similarity index 60% rename from llm/gpu_darwin.go rename to gpu/gpu_darwin.go index 39ee4f75..e4a9456a 100644 --- a/llm/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -1,7 +1,8 @@ //go:build darwin -package llm +package gpu +import "C" import ( "github.com/jmorganca/ollama/api" ) @@ -9,11 +10,25 @@ import ( // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs func CheckVRAM() (int64, error) { // TODO - assume metal, and return free memory? - return 0, errNvidiaSMI + return 0, nil } +func GetGPUInfo() GpuInfo { + // TODO - Metal vs. x86 macs... + + return GpuInfo{ + Driver: "METAL", + TotalMemory: 0, + FreeMemory: 0, + } +} + func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { // default to enable metal on macOS return 1 } + +func nativeInit() error { + return nil +} diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h new file mode 100644 index 00000000..7de36465 --- /dev/null +++ b/gpu/gpu_info.h @@ -0,0 +1,49 @@ +#ifndef __APPLE__ +#ifndef __GPU_INFO_H__ +#define __GPU_INFO_H__ +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#ifndef _WIN32 +#include <dlfcn.h> +#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) +#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym) +#define LOAD_ERR() dlerror() +#define UNLOAD_LIBRARY(handle) dlclose(handle) +#else +#include <windows.h> +#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib) +#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym) +#define UNLOAD_LIBRARY(handle) FreeLibrary(handle) + +// TODO - refactor this with proper error message handling on windows +inline static char *LOAD_ERR() { + static char errbuf[8]; + snprintf(errbuf, 8, "0x%lx", GetLastError()); + return errbuf; +} + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct mem_info { + uint64_t total; + uint64_t free; + char *err; // If non-null, caller responsible for freeing +} mem_info_t; + +void cpu_check_ram(mem_info_t *resp); + +#ifdef __cplusplus +} +#endif + +#include "gpu_info_cuda.h" +#include "gpu_info_rocm.h" + +#endif // __GPU_INFO_H__ +#endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_cpu.c b/gpu/gpu_info_cpu.c new file mode 100644 index 00000000..a7987cd4 --- /dev/null +++ b/gpu/gpu_info_cpu.c @@ -0,0 +1,42 @@ +#include "gpu_info.h" +// Fallbacks for CPU mode + +#ifdef _WIN32 +#include <sysinfoapi.h> +void cpu_check_ram(mem_info_t *resp) { + resp->err = NULL; + MEMORYSTATUSEX info; + if (GlobalMemoryStatusEx(&info) != 0) { + resp->total
= info.ullTotalPhys; + resp->free = info.ullAvailPhys; + } else { + resp->err = strdup(LOAD_ERR()); + } + return; +} + +#elif __linux__ +#include <errno.h> +#include <string.h> +#include <sys/sysinfo.h> +void cpu_check_ram(mem_info_t *resp) { + struct sysinfo info; + resp->err = NULL; + if (sysinfo(&info) != 0) { + resp->err = strdup(strerror(errno)); + } else { + resp->total = info.totalram * info.mem_unit; + resp->free = info.freeram * info.mem_unit; + } + return; +} + +#elif __APPLE__ +// TODO consider an Apple implementation that does something useful +// mem_info_t cpu_check_ram() { +// mem_info_t resp = {0, 0, NULL}; +// return resp; +// } +#else +#error "Unsupported platform" +#endif diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c new file mode 100644 index 00000000..0b2ac867 --- /dev/null +++ b/gpu/gpu_info_cuda.c @@ -0,0 +1,110 @@ +#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs? + +#include "gpu_info_cuda.h" + +#include <string.h> + +#ifndef _WIN32 +const char *cuda_lib_paths[] = { + "libnvidia-ml.so", + "/usr/local/cuda/lib64/libnvidia-ml.so", + NULL, +}; +#else +const char *cuda_lib_paths[] = { + "nvml.dll", + "", + NULL, +}; +#endif + +void cuda_init(cuda_init_resp_t *resp) { + resp->err = NULL; + const int buflen = 256; + char buf[buflen + 1]; + int i; + + struct lookup { + char *s; + void **p; + } l[4] = { + {"nvmlInit_v2", (void *)&resp->ch.initFn}, + {"nvmlShutdown", (void *)&resp->ch.shutdownFn}, + {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle}, + {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo}, + }; + + for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) { + resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY); + } + if (!resp->ch.handle) { + snprintf(buf, buflen, + "Unable to load %s library to query for Nvidia GPUs: %s", + cuda_lib_paths[0], LOAD_ERR()); + resp->err = strdup(buf); + return; + } + + for (i = 0; i < 4; i++) { // TODO - fix this to use a null terminated list + *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s); + if (!l[i].p) { + UNLOAD_LIBRARY(resp->ch.handle); + resp->ch.handle = NULL; + snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, + LOAD_ERR()); + resp->err = strdup(buf); + return; + } + } + return; +} + +void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) { + resp->err = NULL; + nvmlDevice_t device; + nvmlMemory_t memInfo = {0}; + nvmlReturn_t ret; + const int buflen = 256; + char buf[buflen + 1]; + int i; + + if (h.handle == NULL) { + resp->err = strdup("nvml handle isn't initialized"); + return; + } + + ret = (*h.initFn)(); + if (ret != NVML_SUCCESS) { + snprintf(buf, buflen, "nvml vram init failure: %d", ret); + resp->err = strdup(buf); + return; + } + + // TODO - handle multiple GPUs + ret = (*h.getHandle)(0, &device); + if (ret != NVML_SUCCESS) { + (*h.shutdownFn)(); + snprintf(buf, buflen, "unable to get device handle: %d", ret); + resp->err = strdup(buf); + return; + } + + ret = (*h.getMemInfo)(device, &memInfo); + if (ret != NVML_SUCCESS) { + (*h.shutdownFn)(); + snprintf(buf, buflen, "device memory info lookup failure: %d", ret); + resp->err = strdup(buf); + return; + } + resp->total = memInfo.total; + resp->free = memInfo.free; + + ret = (*h.shutdownFn)(); + if (ret != NVML_SUCCESS) { + snprintf(buf, buflen, "nvml vram shutdown failure: %d", ret); + resp->err = strdup(buf); + } + + return; +} +#endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_cuda.h b/gpu/gpu_info_cuda.h new file mode 100644 index 00000000..7d13cb6a --- /dev/null +++ b/gpu/gpu_info_cuda.h @@ -0,0
+1,35 @@ +#ifndef __APPLE__ +#ifndef __GPU_INFO_CUDA_H__ +#define __GPU_INFO_CUDA_H__ +#include "gpu_info.h" + +// Just enough typedef's to dlopen/dlsym for memory information +typedef enum nvmlReturn_enum { + NVML_SUCCESS = 0, + // Other values omitted for now... +} nvmlReturn_t; +typedef void *nvmlDevice_t; // Opaque is sufficient +typedef struct nvmlMemory_st { + unsigned long long total; + unsigned long long free; + unsigned long long used; +} nvmlMemory_t; + +typedef struct cuda_handle { + void *handle; + nvmlReturn_t (*initFn)(void); + nvmlReturn_t (*shutdownFn)(void); + nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *); + nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *); +} cuda_handle_t; + +typedef struct cuda_init_resp { + char *err; // If err is non-null handle is invalid + cuda_handle_t ch; +} cuda_init_resp_t; + +void cuda_init(cuda_init_resp_t *resp); +void cuda_check_vram(cuda_handle_t ch, mem_info_t *resp); + +#endif // __GPU_INFO_CUDA_H__ +#endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c new file mode 100644 index 00000000..88bd2dad --- /dev/null +++ b/gpu/gpu_info_rocm.c @@ -0,0 +1,111 @@ +#ifndef __APPLE__ + +#include "gpu_info_rocm.h" + +#include + +#ifndef _WIN32 +const char *rocm_lib_paths[] = { + "librocm_smi64.so", + "/opt/rocm/lib/librocm_smi64.so", + NULL, +}; +#else +// TODO untested +const char *rocm_lib_paths[] = { + "rocm_smi64.dll", + "/opt/rocm/lib/rocm_smi64.dll", + NULL, +}; +#endif + +void rocm_init(rocm_init_resp_t *resp) { + resp->err = NULL; + const int buflen = 256; + char buf[buflen + 1]; + int i; + struct lookup { + char *s; + void **p; + } l[4] = { + {"rsmi_init", (void *)&resp->rh.initFn}, + {"rsmi_shut_down", (void *)&resp->rh.shutdownFn}, + {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn}, + {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn}, + // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle }, + }; + + for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) { + resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY); + } + if (!resp->rh.handle) { + snprintf(buf, buflen, + "Unable to load %s library to query for Radeon GPUs: %s\n", + rocm_lib_paths[0], LOAD_ERR()); + resp->err = strdup(buf); + return; + } + + for (i = 0; i < 4; i++) { + *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s); + if (!l[i].p) { + UNLOAD_LIBRARY(resp->rh.handle); + snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, + LOAD_ERR()); + resp->err = strdup(buf); + return; + } + } + return; +} + +void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { + resp->err = NULL; + // uint32_t num_devices; + // uint16_t device; + uint64_t totalMem = 0; + uint64_t usedMem = 0; + rsmi_status_t ret; + const int buflen = 256; + char buf[buflen + 1]; + int i; + + ret = (*h.initFn)(0); + if (ret != RSMI_STATUS_SUCCESS) { + snprintf(buf, buflen, "rocm vram init failure: %d", ret); + resp->err = strdup(buf); + return; + } + + // TODO - iterate through devices... 
ret = + // rsmi_num_monitor_devices(&num_devices); + + // ret = (*h.getHandle)(0, &device); + // if (ret != RSMI_STATUS_SUCCESS) { + // printf("rocm vram device lookup failure: %d\n", ret); + // return -1; + // } + + // Get total memory - used memory for available memory + ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem); + if (ret != RSMI_STATUS_SUCCESS) { + (*h.shutdownFn)(); + snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret); + resp->err = strdup(buf); + return; + } + ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem); + if (ret != RSMI_STATUS_SUCCESS) { + (*h.shutdownFn)(); + snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret); + resp->err = strdup(buf); + return; + } + + (*h.shutdownFn)(); + resp->total = totalMem; + resp->free = totalMem - usedMem; + return; +} + +#endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_info_rocm.h b/gpu/gpu_info_rocm.h new file mode 100644 index 00000000..8d7a04ae --- /dev/null +++ b/gpu/gpu_info_rocm.h @@ -0,0 +1,36 @@ +#ifndef __APPLE__ +#ifndef __GPU_INFO_ROCM_H__ +#define __GPU_INFO_ROCM_H__ +#include "gpu_info.h" + +// Just enough typedef's to dlopen/dlsym for memory information +typedef enum rsmi_status_return { + RSMI_STATUS_SUCCESS = 0, + // Other values omitted for now... +} rsmi_status_t; + +typedef enum rsmi_memory_type { + RSMI_MEM_TYPE_VRAM = 0, + RSMI_MEM_TYPE_VIS_VRAM, + RSMI_MEM_TYPE_GTT, +} rsmi_memory_type_t; + +typedef struct rocm_handle { + void *handle; + rsmi_status_t (*initFn)(uint64_t); + rsmi_status_t (*shutdownFn)(void); + rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); + rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *); + // rsmi_status_t (*getHandle)(uint32_t, uint16_t *); +} rocm_handle_t; + +typedef struct rocm_init_resp { + char *err; // If err is non-null handle is invalid + rocm_handle_t rh; +} rocm_init_resp_t; + +void rocm_init(rocm_init_resp_t *resp); +void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp); + +#endif // __GPU_INFO_ROCM_H__ +#endif // __APPLE__ \ No newline at end of file diff --git a/gpu/gpu_test.go b/gpu/gpu_test.go new file mode 100644 index 00000000..cbdcf3ec --- /dev/null +++ b/gpu/gpu_test.go @@ -0,0 +1,26 @@ +package gpu + +import ( + "runtime" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestBasicGetGPUInfo(t *testing.T) { + info := GetGPUInfo() + assert.Contains(t, "CUDA ROCM CPU METAL", info.Driver) + + switch runtime.GOOS { + case "darwin": + // TODO - remove this once MacOS returns some size for CPU + return + case "linux", "windows": + assert.Greater(t, info.TotalMemory, uint64(0)) + assert.Greater(t, info.FreeMemory, uint64(0)) + default: + return + } +} + +// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected diff --git a/gpu/types.go b/gpu/types.go new file mode 100644 index 00000000..a84a0a8d --- /dev/null +++ b/gpu/types.go @@ -0,0 +1,10 @@ +package gpu + +// Beginning of an `ollama info` command +type GpuInfo struct { + Driver string `json:"driver,omitempty"` + TotalMemory uint64 `json:"total_memory,omitempty"` + FreeMemory uint64 `json:"free_memory,omitempty"` + + // TODO add other useful attributes about the card here for discovery information +} diff --git a/llm/accelerator_cuda.go b/llm/accelerator_cuda.go deleted file mode 100644 index f21d6d62..00000000 --- a/llm/accelerator_cuda.go +++ /dev/null @@ -1,67 +0,0 @@ -//go:build cuda - -package llm - -import ( - "bufio" - "bytes" - "errors" - "fmt" - 
"log" - "os/exec" - "path" - "strconv" - "strings" - - "github.com/jmorganca/ollama/format" -) - -var ( - errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed") - errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only") -) - -// acceleratedRunner returns the runner for this accelerator given the provided buildPath string. -func acceleratedRunner(buildPath string) []ModelRunner { - return []ModelRunner{ - ModelRunner{ - Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), - Accelerated: true, - }, - } -} - -// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs -func CheckVRAM() (int64, error) { - cmd := exec.Command("nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits") - var stdout bytes.Buffer - cmd.Stdout = &stdout - err := cmd.Run() - if err != nil { - return 0, errNoAccel - } - - var freeMiB int64 - scanner := bufio.NewScanner(&stdout) - for scanner.Scan() { - line := scanner.Text() - if strings.Contains(line, "[Insufficient Permissions]") { - return 0, fmt.Errorf("GPU support may not enabled, check you have installed GPU drivers and have the necessary permissions to run nvidia-smi") - } - - vram, err := strconv.ParseInt(strings.TrimSpace(line), 10, 64) - if err != nil { - return 0, fmt.Errorf("failed to parse available VRAM: %v", err) - } - - freeMiB += vram - } - - freeBytes := freeMiB * 1024 * 1024 - if freeBytes < 2*format.GigaByte { - log.Printf("less than 2 GB VRAM available") - return 0, errAvailableVRAM - } - - return freeBytes, nil -} diff --git a/llm/accelerator_none.go b/llm/accelerator_none.go deleted file mode 100644 index 442d884a..00000000 --- a/llm/accelerator_none.go +++ /dev/null @@ -1,21 +0,0 @@ -//go:build !rocm && !cuda - -package llm - -import ( - "errors" -) - -var ( - errNoAccel = errors.New("no accelerator support in this binary") -) - -// acceleratedRunner returns the runner for this accelerator given the provided buildPath string. -func acceleratedRunner(buildPath string) []ModelRunner { - return make([]ModelRunner, 0, 1) -} - -// CheckVRAM is a stub with no accelerator. -func CheckVRAM() (int64, error) { - return 0, errNoGPU -} diff --git a/llm/accelerator_rocm.go b/llm/accelerator_rocm.go deleted file mode 100644 index e71b4ea6..00000000 --- a/llm/accelerator_rocm.go +++ /dev/null @@ -1,85 +0,0 @@ -//go:build rocm - -package llm - -import ( - "bytes" - "encoding/csv" - "errors" - "fmt" - "io" - "log" - "os" - "os/exec" - "path" - "path/filepath" - "strconv" - "strings" -) - -var errNoAccel = errors.New("rocm-smi command failed") - -// acceleratedRunner returns the runner for this accelerator given the provided buildPath string. -func acceleratedRunner(buildPath string) []ModelRunner { - return []ModelRunner{ - ModelRunner{ - Path: path.Join(buildPath, "rocm", "bin", "ollama-runner"), - Accelerated: true, - }, - } -} - -// CheckVRAM returns the available VRAM in MiB on Linux machines with AMD GPUs -func CheckVRAM() (int64, error) { - rocmHome := os.Getenv("ROCM_PATH") - if rocmHome == "" { - rocmHome = os.Getenv("ROCM_HOME") - } - if rocmHome == "" { - log.Println("warning: ROCM_PATH is not set. 
Trying a likely fallback path, but it is recommended to set this variable in the environment.") - rocmHome = "/opt/rocm" - } - cmd := exec.Command(filepath.Join(rocmHome, "bin/rocm-smi"), "--showmeminfo", "VRAM", "--csv") - var stdout bytes.Buffer - cmd.Stdout = &stdout - err := cmd.Run() - if err != nil { - return 0, errNoAccel - } - csvData := csv.NewReader(&stdout) - // llama.cpp or ROCm don't seem to understand splitting the VRAM allocations across them properly, so try to find the biggest card instead :(. FIXME. - totalBiggestCard := int64(0) - bigCardName := "" - for { - record, err := csvData.Read() - if err == io.EOF { - break - } - if err != nil { - return 0, fmt.Errorf("failed to parse available VRAM: %v", err) - } - if !strings.HasPrefix(record[0], "card") { - continue - } - cardTotal, err := strconv.ParseInt(record[1], 10, 64) - if err != nil { - return 0, err - } - cardUsed, err := strconv.ParseInt(record[2], 10, 64) - if err != nil { - return 0, err - } - possible := (cardTotal - cardUsed) - log.Printf("ROCm found %d MiB of available VRAM on device %q", possible/1024/1024, record[0]) - if possible > totalBiggestCard { - totalBiggestCard = possible - bigCardName = record[0] - } - } - if totalBiggestCard == 0 { - log.Printf("found ROCm GPU but failed to parse free VRAM!") - return 0, errNoAccel - } - log.Printf("ROCm selecting device %q", bigCardName) - return totalBiggestCard, nil -} diff --git a/llm/ext_server.go b/llm/ext_server.go index 6e31dca7..bd026043 100644 --- a/llm/ext_server.go +++ b/llm/ext_server.go @@ -1,7 +1,7 @@ package llm /* -#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common +#cgo CFLAGS: -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 #cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds #cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable @@ -25,6 +25,8 @@ package llm #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a #cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a + +// Note: the following requires cuda library presence on linux to build, even if you only have rocm or CPU only #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a #cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a @@ -35,7 +37,7 @@ package llm #cgo windows LDFLAGS: -lext_server_shared -lpthread #include -#include "examples/server/server.h" +#include "server.h" */ import "C" @@ -43,25 +45,51 @@ import ( "bytes" "context" "encoding/json" - "errors" "fmt" "log" "os" "runtime" + "strings" "sync" "time" "unsafe" "github.com/jmorganca/ollama/api" + "github.com/jmorganca/ollama/gpu" ) -func errWrap(resp C.ext_server_err) error { - if resp.code == 0 { - return nil +func newExtServerResp(len C.size_t) C.ext_server_resp_t { + var resp C.ext_server_resp_t + resp.msg_len = len + bytes := make([]byte, len) + resp.msg = (*C.char)(C.CBytes(bytes)) + return resp +} + +func freeExtServerResp(resp C.ext_server_resp_t) { + if resp.msg_len == 0 { + return } - err := fmt.Errorf(C.GoString(resp.err)) - C.free(unsafe.Pointer(resp.err)) - return err + C.free(unsafe.Pointer(resp.msg)) +} + +func 
extServerResponseToErr(resp C.ext_server_resp_t) error { + return fmt.Errorf(C.GoString(resp.msg)) +} + +type extServer interface { + LLM + llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) + llama_server_start() + llama_server_stop() + llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) + llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) + llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) + llama_server_release_task_result(result *C.ext_server_task_result_t) + llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) + llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) + llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) + llama_server_release_json_resp(json_resp **C.char) } type llamaExtServer struct { @@ -71,21 +99,61 @@ type llamaExtServer struct { // Note: current implementation does not support concurrent instantiations var mutex sync.Mutex -func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (*llamaExtServer, error) { +func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) { + C.llama_server_init(sparams, err) +} +func (llm *llamaExtServer) llama_server_start() { + C.llama_server_start() +} +func (llm *llamaExtServer) llama_server_stop() { + C.llama_server_stop() +} + +func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) { + C.llama_server_completion(json_req, resp) +} +func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) { + C.llama_server_completion_next_result(task_id, resp) +} +func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) { + C.llama_server_completion_cancel(task_id, err) +} +func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) { + C.llama_server_release_task_result(result) +} + +func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { + C.llama_server_tokenize(json_req, json_resp, err) +} +func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { + C.llama_server_detokenize(json_req, json_resp, err) +} +func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { + C.llama_server_embedding(json_req, json_resp, err) +} +func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) { + C.llama_server_release_json_resp(json_resp) +} + +func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { + server := &llamaExtServer{opts} + return newExtServer(server, model, adapters, projectors, numLayers, opts) +} + +func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { if !mutex.TryLock() { log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete") mutex.Lock() } - server := &llamaExtServer{opts} fileInfo, err := os.Stat(model) if err != nil { return nil, err } - var sparams C.ext_server_params + var sparams C.ext_server_params_t sparams.model = C.CString(model) defer C.free(unsafe.Pointer(sparams.model)) - numGPU := NumGPU(numLayers, 
fileInfo.Size(), opts) + numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts) sparams.embedding = true sparams.n_ctx = C.uint(opts.NumCtx) @@ -97,10 +165,14 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in // Always use the value encoded in the model sparams.rope_freq_base = 0.0 sparams.rope_freq_scale = 0.0 + sparams.memory_f16 = C.bool(opts.F16KV) + sparams.use_mlock = C.bool(opts.UseMLock) + sparams.use_mmap = C.bool(opts.UseMMap) + sparams.numa = C.bool(opts.UseNUMA) sparams.lora_adapters = nil for i := 0; i < len(adapters); i++ { - la := (*C.ext_server_lora_adapter)(C.malloc(C.sizeof_struct_ext_server_lora_adapter)) + la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t)) defer C.free(unsafe.Pointer(la)) la.adapter = C.CString(adapters[i]) defer C.free(unsafe.Pointer(la.adapter)) @@ -116,11 +188,13 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in } } - // TODO - implement ME - // if len(projectors) > 0 { - // // TODO: applying multiple projectors is not supported by the llama.cpp server yet - // params = append(params, "--mmproj", projectors[0]) - // } + if len(projectors) > 0 { + // TODO: applying multiple projectors is not supported by the llama.cpp server yet + sparams.mmproj = C.CString(projectors[0]) + defer C.free(unsafe.Pointer(sparams.mmproj)) + } else { + sparams.mmproj = nil + } if opts.NumThread > 0 { sparams.n_threads = C.uint(opts.NumThread) @@ -128,136 +202,167 @@ func newLlamaExtServer(model string, adapters, projectors []string, numLayers in sparams.n_threads = C.uint(runtime.NumCPU()) } - sparams.memory_f16 = false - if opts.F16KV { - sparams.memory_f16 = true - } - sparams.use_mlock = false - if opts.UseMLock { - sparams.use_mlock = true - } - sparams.use_mmap = true - if !opts.UseMMap { - sparams.use_mmap = false - } - sparams.numa = false - if opts.UseNUMA { - sparams.numa = true - } - log.Printf("Initializing internal llama server") - err = errWrap(C.llama_server_init(&sparams)) - if err != nil { - return nil, err + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + server.llama_server_init(&sparams, &resp) + if resp.id < 0 { + return nil, extServerResponseToErr(resp) } log.Printf("Starting internal llama main loop") - C.llama_server_start() + server.llama_server_start() return server, nil } -func (llm *llamaExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error { +func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error { + return predict(llm, llm.Options, ctx, pred, fn) +} + +func predict(llm extServer, opts api.Options, ctx context.Context, predict PredictOpts, fn func(PredictResult)) error { + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + var imageData []ImageData + if len(predict.Images) > 0 { + for cnt, i := range predict.Images { + imageData = append(imageData, ImageData{Data: i, ID: cnt}) + } + } + log.Printf("loaded %d images", len(imageData)) request := map[string]any{ "prompt": predict.Prompt, "stream": true, - "n_predict": llm.NumPredict, - "n_keep": llm.NumKeep, - "temperature": llm.Temperature, - "top_k": llm.TopK, - "top_p": llm.TopP, - "tfs_z": llm.TFSZ, - "typical_p": llm.TypicalP, - "repeat_last_n": llm.RepeatLastN, - "repeat_penalty": llm.RepeatPenalty, - "presence_penalty": llm.PresencePenalty, - "frequency_penalty": llm.FrequencyPenalty, - "mirostat": llm.Mirostat, - "mirostat_tau": llm.MirostatTau, - "mirostat_eta": llm.MirostatEta, 
- "penalize_nl": llm.PenalizeNewline, - "seed": llm.Seed, - "stop": llm.Stop, + "n_predict": opts.NumPredict, + "n_keep": opts.NumKeep, + "temperature": opts.Temperature, + "top_k": opts.TopK, + "top_p": opts.TopP, + "tfs_z": opts.TFSZ, + "typical_p": opts.TypicalP, + "repeat_last_n": opts.RepeatLastN, + "repeat_penalty": opts.RepeatPenalty, + "presence_penalty": opts.PresencePenalty, + "frequency_penalty": opts.FrequencyPenalty, + "mirostat": opts.Mirostat, + "mirostat_tau": opts.MirostatTau, + "mirostat_eta": opts.MirostatEta, + "penalize_nl": opts.PenalizeNewline, + "seed": opts.Seed, + "stop": opts.Stop, + "image_data": imageData, } if predict.Format == "json" { request["grammar"] = jsonGrammar } - // Handling JSON marshaling with special characters unescaped. - buffer := &bytes.Buffer{} - enc := json.NewEncoder(buffer) - enc.SetEscapeHTML(false) + retryDelay := 100 * time.Microsecond + for retries := 0; retries < maxRetries; retries++ { + if retries > 0 { + time.Sleep(retryDelay) // wait before retrying + retryDelay *= 2 // exponential backoff + } - if err := enc.Encode(request); err != nil { - return fmt.Errorf("failed to marshal data: %w", err) - } + // Handling JSON marshaling with special characters unescaped. + buffer := &bytes.Buffer{} + enc := json.NewEncoder(buffer) + enc.SetEscapeHTML(false) - req := C.CString(buffer.String()) - defer C.free(unsafe.Pointer(req)) + if err := enc.Encode(request); err != nil { + return fmt.Errorf("failed to marshal data: %w", err) + } - cmpCtx := C.llama_server_completion(req) - if cmpCtx.task_id < 0 { - defer C.free(unsafe.Pointer(cmpCtx.err)) - return fmt.Errorf(C.GoString(cmpCtx.err)) - } + req := C.CString(buffer.String()) + defer C.free(unsafe.Pointer(req)) - for { - select { - case <-ctx.Done(): - // This handles the request cancellation - return errWrap(C.llama_server_completion_cancel(cmpCtx.task_id)) - default: - result := C.llama_server_completion_next_result(cmpCtx.task_id) - if result.result_json != nil { - defer C.free(unsafe.Pointer(result.result_json)) - } - var p prediction - if err := json.Unmarshal([]byte(C.GoString(result.result_json)), &p); err != nil { - err2 := errWrap(C.llama_server_completion_cancel(cmpCtx.task_id)) - return errors.Join(fmt.Errorf("error unmarshaling llm prediction response: %w", err), err2) - } + llm.llama_server_completion(req, &resp) + if resp.id < 0 { + return extServerResponseToErr(resp) + } - if p.Content != "" { - fn(PredictResult{ - // Model: predict.Model, // XXX remove or replace? - CreatedAt: time.Now().UTC(), - Content: p.Content, - }) - } + retryNeeded := false + out: + for { + select { + case <-ctx.Done(): + // This handles the request cancellation + llm.llama_server_completion_cancel(resp.id, &resp) + if resp.id < 0 { + return extServerResponseToErr(resp) + } else { + return nil + } + default: + var result C.ext_server_task_result_t + llm.llama_server_completion_next_result(resp.id, &result) + json_resp := C.GoString(result.json_resp) + llm.llama_server_release_task_result(&result) - if p.Stop { - fn(PredictResult{ - // Model: predict.Model, // XXX remove or replace? 
- CreatedAt: time.Now().UTC(), - TotalDuration: time.Since(predict.CheckpointStart), - Done: true, - PromptEvalCount: p.Timings.PromptN, - PromptEvalDuration: parseDurationMs(p.Timings.PromptMS), - EvalCount: p.Timings.PredictedN, - EvalDuration: parseDurationMs(p.Timings.PredictedMS), - }) - return nil + var p prediction + if err := json.Unmarshal([]byte(json_resp), &p); err != nil { + llm.llama_server_completion_cancel(resp.id, &resp) + if resp.id < 0 { + return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg)) + } else { + return fmt.Errorf("error unmarshaling llm prediction response: %w", err) + } + } + + if bool(result.error) && strings.Contains(json_resp, "slot unavailable") { + retryNeeded = true + // task will already be canceled + break out + } + + if p.Content != "" { + fn(PredictResult{ + Content: p.Content, + }) + } + + if p.Stop { + fn(PredictResult{ + Done: true, + PromptEvalCount: p.Timings.PromptN, + PromptEvalDuration: parseDurationMs(p.Timings.PromptMS), + EvalCount: p.Timings.PredictedN, + EvalDuration: parseDurationMs(p.Timings.PredictedMS), + }) + return nil + } } } + if !retryNeeded { + return nil // success + } } + + // should never reach here ideally + return fmt.Errorf("max retries exceeded") +} +func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { + return encode(llm, ctx, prompt) } -func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { +func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) { data, err := json.Marshal(TokenizeRequest{Content: prompt}) if err != nil { return nil, fmt.Errorf("marshaling encode data: %w", err) } req := C.CString(string(data)) defer C.free(unsafe.Pointer(req)) - var resp C.ext_server_resp - err = errWrap(C.llama_server_tokenize(req, &resp)) - if resp.json_resp != nil { - defer C.free(unsafe.Pointer(resp.json_resp)) + var json_resp *C.char + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + llm.llama_server_tokenize(req, &json_resp, &resp) + if resp.id < 0 { + return nil, extServerResponseToErr(resp) } + defer llm.llama_server_release_json_resp(&json_resp) var encoded TokenizeResponse - if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &encoded); err2 != nil { + if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil { return nil, fmt.Errorf("unmarshal encode response: %w", err2) } @@ -265,6 +370,10 @@ func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, er } func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) { + return decode(llm, ctx, tokens) +} + +func decode(llm extServer, ctx context.Context, tokens []int) (string, error) { if len(tokens) == 0 { return "", nil } @@ -275,14 +384,17 @@ func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, er req := C.CString(string(data)) defer C.free(unsafe.Pointer(req)) - var resp C.ext_server_resp - err = errWrap(C.llama_server_detokenize(req, &resp)) - if resp.json_resp != nil { - defer C.free(unsafe.Pointer(resp.json_resp)) + var json_resp *C.char + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + llm.llama_server_detokenize(req, &json_resp, &resp) + if resp.id < 0 { + return "", extServerResponseToErr(resp) } + defer llm.llama_server_release_json_resp(&json_resp) var decoded DetokenizeResponse - if err2 := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &decoded); err2 != nil { + if err2 := 
json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil { return "", fmt.Errorf("unmarshal encode response: %w", err2) } @@ -290,6 +402,9 @@ func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, er } func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { + return embedding(llm, ctx, input) +} +func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) { data, err := json.Marshal(TokenizeRequest{Content: input}) if err != nil { return nil, fmt.Errorf("error marshaling embed data: %w", err) @@ -297,29 +412,28 @@ func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float req := C.CString(string(data)) defer C.free(unsafe.Pointer(req)) - var resp C.ext_server_resp - err = errWrap(C.llama_server_embedding(req, &resp)) - if resp.json_resp != nil { - defer C.free(unsafe.Pointer(resp.json_resp)) - } - if err != nil { - return nil, err + var json_resp *C.char + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + llm.llama_server_embedding(req, &json_resp, &resp) + if resp.id < 0 { + return nil, extServerResponseToErr(resp) } + defer llm.llama_server_release_json_resp(&json_resp) var embedding EmbeddingResponse - if err := json.Unmarshal([]byte(C.GoString(resp.json_resp)), &embedding); err != nil { + if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil { return nil, fmt.Errorf("unmarshal tokenize response: %w", err) } return embedding.Embedding, nil } -func (llm *llamaExtServer) Ping(ctx context.Context) error { - // TODO - consider some mechanism to check if the main loop and llama.cpp are in a good state - return nil +func (llm *llamaExtServer) Close() { + close(llm) } -func (llm *llamaExtServer) Close() { - C.llama_server_stop() +func close(llm extServer) { + llm.llama_server_stop() mutex.Unlock() } diff --git a/llm/gpu_cuda.go b/llm/gpu_cuda.go deleted file mode 100644 index 0afa8e2b..00000000 --- a/llm/gpu_cuda.go +++ /dev/null @@ -1,57 +0,0 @@ -//go:build linux || windows - -package llm - -import ( - "errors" - "log" - - "github.com/jmorganca/ollama/api" -) - -/* -#cgo windows LDFLAGS: -L"/Program Files/NVIDIA Corporation/NVSMI/" -#cgo linux LDFLAGS: -lnvidia-ml - -#include -#include "examples/server/server.h" -*/ -import "C" - -// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs -func CheckVRAM() (int64, error) { - return int64(C.check_vram()), nil -} - -func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { - if opts.NumGPU != -1 { - return opts.NumGPU - } - freeBytes, err := CheckVRAM() - if err != nil { - if !errors.Is(err, errNvidiaSMI) { - log.Print(err.Error()) - } - // nvidia driver not installed or no nvidia GPU found - return 0 - } - - /* - Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers. - We can store the model weights and the kv cache in vram, - to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file. - */ - bytesPerLayer := fileSizeBytes / numLayer - - // 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors - layers := int(freeBytes/bytesPerLayer) * 3 / 4 - - // TODO - not sure on this part... 
if we can't fit all the layers, just fallback to CPU - // if int64(layers) < numLayer { - // log.Printf("%d MB VRAM available, insufficient to load current model (reuires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024)) - // return 0 - // } - log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", freeBytes/(1024*1024), layers, numLayer) - - return layers -} diff --git a/llm/llama.cpp/gen_common.sh b/llm/llama.cpp/gen_common.sh index f17d19de..2f75104f 100644 --- a/llm/llama.cpp/gen_common.sh +++ b/llm/llama.cpp/gen_common.sh @@ -1,10 +1,11 @@ # common logic accross linux and darwin init_vars() { + LLAMACPP_DIR=gguf PATCHES="0001-Expose-callable-API-for-server.patch" CMAKE_DEFS="-DLLAMA_ACCELERATE=on" # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings - CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server" + CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static" if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}" else @@ -29,6 +30,6 @@ apply_patches() { } build() { - cmake -S gguf -B ${BUILD_DIR} ${CMAKE_DEFS} - cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 + cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS} + cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 } \ No newline at end of file diff --git a/llm/llama.cpp/gen_darwin.sh b/llm/llama.cpp/gen_darwin.sh index 448c595b..f159ceff 100755 --- a/llm/llama.cpp/gen_darwin.sh +++ b/llm/llama.cpp/gen_darwin.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # This script is intended to run inside the go generate # working directory must be ../llm/llama.cpp @@ -30,6 +30,7 @@ git_module_setup apply_patches build +# TODO - improve this to handle test cases that need it to be in "." 
around the tree # Enable local debug/run usecase if [ -e "gguf/ggml-metal.metal" ]; then cp gguf/ggml-metal.metal ../../ diff --git a/llm/llama.cpp/gen_linux.sh b/llm/llama.cpp/gen_linux.sh index c5405dd8..93c998f4 100755 --- a/llm/llama.cpp/gen_linux.sh +++ b/llm/llama.cpp/gen_linux.sh @@ -1,17 +1,73 @@ -#!/bin/sh +#!/bin/bash # This script is intended to run inside the go generate # working directory must be ../llm/llama.cpp set -ex set -o pipefail -# TODO - stopped here - map the variables from above over and refine the case statement below - echo "Starting linux generate script" +if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ] ; then + export CUDACXX=/usr/local/cuda/bin/nvcc +fi source $(dirname $0)/gen_common.sh init_vars -CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" -BUILD_DIR="gguf/build/cuda" git_module_setup apply_patches +CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" +BUILD_DIR="gguf/build/cuda" +LIB_DIR="${BUILD_DIR}/lib" +mkdir -p ../../dist/ build +# TODO - explore mechanism to soften the hard cuda dependency on linux +# by conditionally building some archive here that aggregates the cuda libs if present +# so that the cgo flags link this intermediate archive instead of the underlying cuda libs +# +# gcc -fPIC -g -shared -o ${LIB_DIR}/libcuda_server.so \ +# -Wl,--whole-archive \ +# ${BUILD_DIR}/examples/server/CMakeFiles/ext_server.dir/server.cpp.o \ +# ${BUILD_DIR}/common/libcommon.a \ +# ${BUILD_DIR}/libllama.a \ +# ${BUILD_DIR}/examples/llava/libllava_static.a \ +# -Wl,--no-whole-archive \ +# -lrt -lpthread -ldl -lstdc++ -lm \ +# /usr/local/cuda/lib64/libcudart_static.a \ +# /usr/local/cuda/lib64/libcublas_static.a \ +# /usr/local/cuda/lib64/libcublasLt_static.a \ +# /usr/local/cuda/lib64/libcudadevrt.a \ +# /usr/local/cuda/lib64/libculibos.a + +if [ -z "${ROCM_PATH}" ] ; then + # Try the default location in case it exists + ROCM_PATH=/opt/rocm +fi + +if [ -z "${CLBlast_DIR}" ] ; then + # Try the default location in case it exists + if [ -d /usr/lib/cmake/CLBlast ]; then + export CLBlast_DIR=/usr/lib/cmake/CLBlast + fi +fi + +BUILD_DIR="gguf/build/rocm" +LIB_DIR="${BUILD_DIR}/lib" +mkdir -p ${LIB_DIR} +# Ensure we have at least one file present for the embed +touch ${LIB_DIR}/.generated + +if [ -d "${ROCM_PATH}" ] ; then + echo "Building ROCm" + init_vars + CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'" + CMAKE_DEFS="-DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + build + gcc -fPIC -g -shared -o ${LIB_DIR}/librocm_server.so \ + -Wl,--whole-archive \ + ${BUILD_DIR}/examples/server/libext_server.a \ + ${BUILD_DIR}/common/libcommon.a \ + ${BUILD_DIR}/libllama.a \ + -Wl,--no-whole-archive \ + -lrt -lpthread -ldl -lstdc++ -lm \ + -L/opt/rocm/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ \ + 
-Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \ + -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu +fi diff --git a/llm/llama.cpp/gen_windows.ps1 b/llm/llama.cpp/gen_windows.ps1 index 9717b2e7..f85f1a45 100644 --- a/llm/llama.cpp/gen_windows.ps1 +++ b/llm/llama.cpp/gen_windows.ps1 @@ -48,4 +48,8 @@ init_vars git_module_setup apply_patches build -install \ No newline at end of file +install + +# TODO - implement ROCm support on windows +md gguf/build/winrocm/lib -ea 0 +echo $null >> gguf/build/winrocm/lib/.generated diff --git a/llm/llama.cpp/generate_linux.go b/llm/llama.cpp/generate_linux.go index 6782a614..119b5c27 100644 --- a/llm/llama.cpp/generate_linux.go +++ b/llm/llama.cpp/generate_linux.go @@ -1,3 +1,3 @@ package llm -//go:generate sh ./gen_linux.sh +//go:generate bash ./gen_linux.sh diff --git a/llm/llama.cpp/generate_linux_cuda.go b/llm/llama.cpp/generate_linux_cuda.go deleted file mode 100644 index 86a95977..00000000 --- a/llm/llama.cpp/generate_linux_cuda.go +++ /dev/null @@ -1,24 +0,0 @@ -//go:build cuda - -package llm - -//go:generate git submodule init - -//go:generate git submodule update --force ggml -//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch -//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch -//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch -//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch - -//go:generate rm -rf ggml/build/cuda -//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -//go:generate cmake --build ggml/build/cuda --target server --config Release -//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner - -//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch -//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch - -//go:generate rm -rf gguf/build/cuda -//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -//go:generate cmake --build gguf/build/cuda --target server --config Release -//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner diff --git a/llm/llama.cpp/generate_linux_rocm.go b/llm/llama.cpp/generate_linux_rocm.go deleted file mode 100644 index 1766be84..00000000 --- a/llm/llama.cpp/generate_linux_rocm.go +++ /dev/null @@ -1,25 +0,0 @@ -//go:build rocm - -package llm - -//go:generate git submodule init - -//go:generate git submodule update --force ggml -//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch -//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch -//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch -//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch - -//go:generate git submodule update --force gguf -//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch -//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch - -//go:generate rm -rf ggml/build/rocm -//go:generate cmake -S ggml -B ggml/build/rocm -DLLAMA_CLBLAST=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -//go:generate cmake --build ggml/build/rocm --target server --config Release 
-//go:generate mv ggml/build/rocm/bin/server ggml/build/rocm/bin/ollama-runner - -//go:generate rm -rf gguf/build/rocm -//go:generate cmake -S gguf -B gguf/build/rocm -DLLAMA_HIPBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -//go:generate cmake --build gguf/build/rocm --target server --config Release -//go:generate mv gguf/build/rocm/bin/server gguf/build/rocm/bin/ollama-runner diff --git a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch index 838347d5..623243d4 100644 --- a/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch +++ b/llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch @@ -1,15 +1,15 @@ -From 64b3fbb150d12b3ca63ac2fb4e57bc46f41d2ccd Mon Sep 17 00:00:00 2001 +From 087cf3300e973d7790db8f7cad01d2a790de38be Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 13 Nov 2023 12:25:58 -0800 Subject: [PATCH] Expose callable API for server This adds an extern "C" interface within the example server --- - examples/server/CMakeLists.txt | 24 ++++ - examples/server/server.cpp | 247 +++++++++++++++++++++++++++++++++ - examples/server/server.h | 83 +++++++++++ + examples/server/CMakeLists.txt | 24 +++ + examples/server/server.cpp | 274 +++++++++++++++++++++++++++++++++ + examples/server/server.h | 89 +++++++++++ ggml-cuda.cu | 1 + - 4 files changed, 355 insertions(+) + 4 files changed, 388 insertions(+) create mode 100644 examples/server/server.h diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt @@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644 +endif() \ No newline at end of file diff --git a/examples/server/server.cpp b/examples/server/server.cpp -index 895f751..f939590 100644 +index d0cd8e1..5f5d4c5 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -5,6 +5,9 @@ @@ -59,7 +59,7 @@ index 895f751..f939590 100644 #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error -@@ -2631,6 +2634,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con +@@ -2632,6 +2635,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con } } @@ -67,84 +67,84 @@ index 895f751..f939590 100644 int main(int argc, char **argv) { // own arguments required by this example -@@ -3065,3 +3069,246 @@ int main(int argc, char **argv) +@@ -3066,3 +3070,273 @@ int main(int argc, char **argv) llama_backend_free(); return 0; } + +#else // LLAMA_SERVER_LIBRARY +// Expose the llama server as a callable extern "C" API -+llama_server_context llama; ++llama_server_context *llama = NULL; +std::atomic ext_server_running(false); +std::thread ext_server_thread; -+inline ext_server_err makeErr(uint32_t code, std::string msg) { -+ if (code == 0) { -+ return ext_server_err{0, NULL}; -+ } -+ const std::string::size_type size = msg.size(); -+ ext_server_err ret = { -+ code, -+ new char[size + 1], -+ }; -+ memcpy(ret.err, msg.c_str(), size + 1); -+ return ret; -+} + -+ext_server_err llama_server_init(ext_server_params *sparams) ++void 
llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) +{ -+ log_set_target(stdout); -+ gpt_params params; -+ params.n_ctx = sparams->n_ctx; -+ params.n_batch = sparams->n_batch; -+ params.n_threads = sparams->n_threads; -+ params.n_parallel = sparams->n_parallel; -+ params.rope_freq_base = sparams->rope_freq_base; -+ params.rope_freq_scale = sparams->rope_freq_scale; -+ -+ if (sparams->memory_f16) { -+ params.cache_type_k = "f16"; -+ params.cache_type_v = "f16"; -+ } else { -+ params.cache_type_k = "f32"; -+ params.cache_type_v = "f32"; -+ } -+ -+ params.n_gpu_layers = sparams->n_gpu_layers; -+ params.main_gpu = sparams->main_gpu; -+ params.use_mlock = sparams->use_mlock; -+ params.use_mmap = sparams->use_mmap; -+ params.numa = sparams->numa; -+ params.embedding = sparams->embedding; -+ if (sparams->model != NULL) { -+ params.model = sparams->model; -+ } -+ -+ for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) { -+ params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale)); -+ } -+ ++ assert(err != NULL && sparams != NULL); ++ err->id = 0; ++ err->msg[0] = '\0'; + try { ++ llama = new llama_server_context; ++ log_set_target(stdout); ++ gpt_params params; ++ params.n_ctx = sparams->n_ctx; ++ params.n_batch = sparams->n_batch; ++ params.n_threads = sparams->n_threads; ++ params.n_parallel = sparams->n_parallel; ++ params.rope_freq_base = sparams->rope_freq_base; ++ params.rope_freq_scale = sparams->rope_freq_scale; ++ ++ if (sparams->memory_f16) { ++ params.cache_type_k = "f16"; ++ params.cache_type_v = "f16"; ++ } else { ++ params.cache_type_k = "f32"; ++ params.cache_type_v = "f32"; ++ } ++ ++ params.n_gpu_layers = sparams->n_gpu_layers; ++ params.main_gpu = sparams->main_gpu; ++ params.use_mlock = sparams->use_mlock; ++ params.use_mmap = sparams->use_mmap; ++ params.numa = sparams->numa; ++ params.embedding = sparams->embedding; ++ if (sparams->model != NULL) { ++ params.model = sparams->model; ++ } ++ ++ for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL; la = la->next) { ++ params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale)); ++ } ++ ++ if (sparams->mmproj != NULL) { ++ params.mmproj = std::string(sparams->mmproj); ++ } ++ + llama_backend_init(params.numa); + + // load the model -+ if (!llama.load_model(params)) ++ if (!llama->load_model(params)) + { + // TODO - consider modifying the logging logic or patching load_model so we can capture more detailed error messages + // and pass them back to the caller for better UX -+ return makeErr(1, "error loading model " + params.model); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str()); ++ return; + } + -+ llama.initialize(); ++ llama->initialize(); + } catch (std::exception &e) { -+ return makeErr(1, e.what()); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "exception %s", e.what()); + } catch (...) 
{ -+ return makeErr(1, "Unknown Exception initializing llama server"); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "Unknown exception initializing llama server"); + } -+ return makeErr(0, ""); +} + +void llama_server_start() +{ ++ assert(llama != NULL); + // TODO mutex to protect thread creation + ext_server_thread = std::thread([&]() + { @@ -154,7 +154,7 @@ index 895f751..f939590 100644 + ggml_time_init(); + while (ext_server_running.load()) + { -+ if (!llama.update_slots()) { ++ if (!llama->update_slots()) { + LOG_TEE("unexpected error in llama server update_slots - exiting main loop\n"); + break; + } @@ -170,124 +170,150 @@ index 895f751..f939590 100644 +} + +void llama_server_stop() { ++ assert(llama != NULL); + // TODO - too verbose, remove once things are solid + LOG_TEE("requesting llama server shutdown\n"); + ext_server_running = false; + ext_server_thread.join(); ++ delete llama; ++ llama = NULL; + LOG_TEE("llama server shutdown complete\n"); +} + -+ext_server_completion_resp llama_server_completion(const char *json_req) { -+ std::string msg; -+ ext_server_completion_resp resp = { -+ 0, -+ NULL, -+ }; ++void llama_server_completion(const char *json_req, ext_server_resp_t *resp) { ++ assert(llama != NULL && json_req != NULL && resp != NULL); ++ resp->id = -1; ++ resp->msg[0] = '\0'; + try { + json data = json::parse(json_req); -+ resp.task_id = llama.request_completion(data, false, false, -1); -+ return resp; ++ resp->id = llama->request_completion(data, false, false, -1); + } catch (std::exception &e) { -+ msg = e.what(); ++ snprintf(resp->msg, resp->msg_len, "exception %s", e.what()); + } catch (...) { -+ msg = "Unknown Exception during completion"; ++ snprintf(resp->msg, resp->msg_len, "Unknown exception during completion"); + } -+ const std::string::size_type size = msg.size(); -+ resp.task_id = 0; -+ resp.err = new char[size + 1]; -+ memcpy(resp.err, msg.c_str(), size + 1); -+ return resp; +} + -+ext_task_result llama_server_completion_next_result(const int task_id) { ++void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *resp) { ++ assert(llama != NULL && resp != NULL); + std::string msg; -+ ext_task_result resp = {-1,false,false,NULL}; -+ try { -+ task_result result = llama.next_result(task_id); -+ std::string result_json = result.result_json.dump(-1, ' ', false, json::error_handler_t::replace); -+ const std::string::size_type size = result_json.size(); -+ resp.id = result.id; -+ resp.stop = result.stop; -+ resp.error = result.error; -+ resp.result_json = new char[size + 1]; -+ memcpy(resp.result_json, result_json.c_str(), size + 1); -+ if (result.error) { -+ llama.request_cancel(task_id); -+ } else if (result.stop) { -+ llama.request_cancel(task_id); -+ } -+ return resp; -+ } catch (std::exception &e) { -+ msg = e.what(); // TODO - json? -+ } catch (...) { -+ msg = "Unknown Exception during completion"; -+ } -+ resp.error = true; -+ const std::string::size_type size = msg.size(); -+ resp.result_json = new char[size + 1]; -+ memcpy(resp.result_json, msg.c_str(), size + 1); -+ return resp; -+} -+ -+ext_server_err llama_server_completion_cancel(const int task_id) { -+ try { -+ llama.request_cancel(task_id); -+ } catch (std::exception &e) { -+ return makeErr(1, e.what()); -+ } catch (...) 
{ -+ return makeErr(1, "Unknown Exception running llama server"); -+ } -+ return makeErr(0, ""); -+} -+ -+ -+ext_server_err llama_server_tokenize(const char *json_req, ext_server_resp *resp) { ++ resp->id = -1; ++ resp->stop = false; ++ resp->error = false; + resp->json_resp = NULL; ++ std::string result_json; ++ try { ++ task_result result = llama->next_result(task_id); ++ result_json = result.result_json.dump(-1, ' ', false, json::error_handler_t::replace); ++ resp->id = result.id; ++ resp->stop = result.stop; ++ resp->error = result.error; ++ if (result.error) { ++ llama->request_cancel(task_id); ++ } else if (result.stop) { ++ llama->request_cancel(task_id); ++ } ++ } catch (std::exception &e) { ++ resp->error = true; ++ resp->id = -1; ++ result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}"; ++ } catch (...) { ++ resp->error = true; ++ resp->id = -1; ++ result_json = "{\"error\":\"Unknown exception during completion\"}"; ++ } ++ const std::string::size_type size = result_json.size() + 1; ++ resp->json_resp = new char[size]; ++ snprintf(resp->json_resp, size, "%s", result_json.c_str()); ++} ++ ++void llama_server_release_task_result(ext_server_task_result_t *result) { ++ if (result == NULL || result->json_resp == NULL) { ++ return; ++ } ++ delete[] result->json_resp; ++} ++ ++void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) { ++ assert(llama != NULL && err != NULL); ++ err->id = 0; ++ err->msg[0] = '\0'; ++ try { ++ llama->request_cancel(task_id); ++ } catch (std::exception &e) { ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "exception %s", e.what()); ++ } catch (...) { ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "Unknown exception completion cancel in llama server"); ++ } ++} ++ ++void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) { ++ assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL); ++ *json_resp = NULL; ++ err->id = 0; ++ err->msg[0] = '\0'; + try { + const json body = json::parse(json_req); + std::vector tokens; + if (body.count("content") != 0) + { -+ tokens = llama.tokenize(body["content"], false); ++ tokens = llama->tokenize(body["content"], false); + } + const json data = format_tokenizer_response(tokens); + std::string result_json = data.dump(); -+ const std::string::size_type size = result_json.size(); -+ resp->json_resp = new char[size + 1]; -+ memcpy(resp->json_resp, result_json.c_str(), size + 1); ++ const std::string::size_type size = result_json.size() + 1; ++ *json_resp = new char[size]; ++ snprintf(*json_resp, size, "%s", result_json.c_str()); + } catch (std::exception &e) { -+ return makeErr(1, e.what()); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "exception %s", e.what()); + } catch (...) 
{ -+ return makeErr(1, "Unknown Exception during tokenize"); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "Unknown exception during tokenize"); + } -+ return makeErr(0, ""); +} + -+ext_server_err llama_server_detokenize(const char *json_req, ext_server_resp *resp) { -+ resp->json_resp = NULL; ++void llama_server_release_json_resp(char **json_resp) { ++ if (json_resp == NULL || *json_resp == NULL) { ++ return; ++ } ++ delete[] *json_resp; ++} ++ ++void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err) { ++ assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL); ++ *json_resp = NULL; ++ err->id = 0; ++ err->msg[0] = '\0'; + try { + const json body = json::parse(json_req); + std::string content; + if (body.count("tokens") != 0) + { + const std::vector tokens = body["tokens"]; -+ content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend()); ++ content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend()); + } + const json data = format_detokenized_response(content); + std::string result_json = data.dump(); -+ const std::string::size_type size = result_json.size(); -+ resp->json_resp = new char[size + 1]; -+ memcpy(resp->json_resp, result_json.c_str(), size + 1); ++ const std::string::size_type size = result_json.size() + 1; ++ *json_resp = new char[size]; ++ snprintf(*json_resp, size, "%s", result_json.c_str()); + } catch (std::exception &e) { -+ return makeErr(1, e.what()); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "exception %s", e.what()); + } catch (...) { -+ return makeErr(1, "Unknown Exception during detokenize"); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "Unknown exception during detokenize"); + } -+ return makeErr(0, ""); +} + -+ext_server_err llama_server_embedding(const char *json_req, ext_server_resp *resp) { -+ resp->json_resp = NULL; ++void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err) { ++ assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL); ++ *json_resp = NULL; ++ err->id = 0; ++ err->msg[0] = '\0'; + try { + const json body = json::parse(json_req); + json prompt; @@ -299,28 +325,29 @@ index 895f751..f939590 100644 + { + prompt = ""; + } -+ const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); -+ task_result result = llama.next_result(task_id); ++ const int task_id = llama->request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true, -1); ++ task_result result = llama->next_result(task_id); + std::string result_json = result.result_json.dump(); -+ const std::string::size_type size = result_json.size(); -+ resp->json_resp = new char[size + 1]; -+ memcpy(resp->json_resp, result_json.c_str(), size + 1); ++ const std::string::size_type size = result_json.size() + 1; ++ *json_resp = new char[size]; ++ snprintf(*json_resp, size, "%s", result_json.c_str()); + } catch (std::exception &e) { -+ return makeErr(1, e.what()); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "exception %s", e.what()); + } catch (...) 
{ -+ return makeErr(1, "Unknown Exception during detokenize"); ++ err->id = -1; ++ snprintf(err->msg, err->msg_len, "Unknown exception during embedding"); + } -+ return makeErr(0, ""); +} + +#endif // LLAMA_SERVER_LIBRARY \ No newline at end of file diff --git a/examples/server/server.h b/examples/server/server.h new file mode 100644 -index 0000000..4d03b1e +index 0000000..d22f1b6 --- /dev/null +++ b/examples/server/server.h -@@ -0,0 +1,83 @@ +@@ -0,0 +1,89 @@ +#if defined(LLAMA_SERVER_LIBRARY) +#ifndef LLAMA_SERVER_H +#define LLAMA_SERVER_H @@ -336,20 +363,23 @@ index 0000000..4d03b1e +extern "C" +{ +#endif -+ // TODO - clean the type def's up a bit for better consistency -+ typedef struct ext_server_err { -+ uint32_t code; // 0 on success, > 0 on error -+ char *err; // null if code == 0; else contains error message. Caller responsible for freeing memory -+ } ext_server_err; ++ typedef struct ext_server_resp { ++ int id; // < 0 on error ++ size_t msg_len; // caller must allocate msg and set msg_len ++ char *msg; ++ } ext_server_resp_t; + ++ // Allocated and freed by caller + typedef struct ext_server_lora_adapter { + char *adapter; + float scale; + struct ext_server_lora_adapter *next; -+ } ext_server_lora_adapter; ++ } ext_server_lora_adapter_t; ++ ++ // Allocated and freed by caller + typedef struct ext_server_params + { -+ char *model; ++ char *model; + uint32_t n_ctx; // text context, 0 = from model + uint32_t n_batch; // prompt processing maximum batch size + uint32_t n_threads; // number of threads to use for generation @@ -363,40 +393,43 @@ index 0000000..4d03b1e + bool use_mmap; // use mmap if possible + bool numa; // attempt optimizations that help on some NUMA systems + bool embedding; // get only sentence embedding -+ ext_server_lora_adapter* lora_adapters; -+ } ext_server_params; ++ ext_server_lora_adapter_t* lora_adapters; ++ char *mmproj; ++ } ext_server_params_t; + -+ // Initialize the server once per process -+ ext_server_err llama_server_init(ext_server_params *sparams); -+ -+ // Run the main loop -+ void llama_server_start(); -+ // Stop the main loop -+ void llama_server_stop(); -+ -+ typedef struct ext_task_result ++ typedef struct ext_server_task_result + { + int id; + bool stop; + bool error; -+ char* result_json; // caller responsible to free this memory -+ } ext_task_result; -+ -+ typedef struct ext_server_completion_resp { -+ int task_id; // < 0 on error, >= 0 on success -+ char *err; // null if task_id >= 0; else contains error message. 
Caller responsible for freeing memory -+ } ext_server_completion_resp; -+ ext_server_completion_resp llama_server_completion(const char *json_req); -+ ext_task_result llama_server_completion_next_result(const int task_id); -+ ext_server_err llama_server_completion_cancel(const int task_id); ++ char* json_resp; // null terminated, memory managed by ext_server ++ } ext_server_task_result_t; + -+ // Caller responsible for freeing json_resp -+ typedef struct ext_server_resp { -+ char *json_resp; // Caller responsible for freeing string -+ } ext_server_resp; -+ ext_server_err llama_server_tokenize(const char *json_req, ext_server_resp *resp); -+ ext_server_err llama_server_detokenize(const char *json_req, ext_server_resp *resp); -+ ext_server_err llama_server_embedding(const char *json_req, ext_server_resp *resp); ++ // Initialize the server once per process ++ // err->id = 0 for success and err->msg[0] = NULL ++ // err->id != 0 for failure, and err->msg contains error message ++ void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err); ++ ++ // Run the main loop, called once per init ++ void llama_server_start(); ++ // Stop the main loop and free up resources allocated in init and start. Init must be called again to reuse ++ void llama_server_stop(); ++ ++ // json_req null terminated string, memory managed by caller ++ // resp->id >= 0 on success (task ID) ++ // resp->id < 0 on error, and resp->msg contains error message ++ void llama_server_completion(const char *json_req, ext_server_resp_t *resp); ++ ++ // Caller must call llama_server_release_task_result to free resp->json_resp ++ void llama_server_completion_next_result(const int task_id, ext_server_task_result_t *result); ++ void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err); ++ void llama_server_release_task_result(ext_server_task_result_t *result); ++ ++ // Caller must call llama_server_releaes_json_resp to free json_resp if err.id < 0 ++ void llama_server_tokenize(const char *json_req, char **json_resp, ext_server_resp_t *err); ++ void llama_server_detokenize(const char *json_req, char **json_resp, ext_server_resp_t *err); ++ void llama_server_embedding(const char *json_req, char** json_resp, ext_server_resp_t *err); ++ void llama_server_release_json_resp(char **json_resp); + +#ifdef __cplusplus +} @@ -406,10 +439,10 @@ index 0000000..4d03b1e +#endif // LLAMA_SERVER_LIBRARY \ No newline at end of file diff --git a/ggml-cuda.cu b/ggml-cuda.cu -index 85f7a29..ce51364 100644 +index 9e1acd3..ea64b55 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu -@@ -6410,6 +6410,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( +@@ -6505,6 +6505,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( CUDA_CHECK(cudaGetDevice(&id)); src_ptr = (char *) extra->data_device[id]; } else { diff --git a/llm/llama.go b/llm/llama.go index b3c57d47..26a0d588 100644 --- a/llm/llama.go +++ b/llm/llama.go @@ -3,6 +3,7 @@ package llm import ( "bytes" "context" + _ "embed" "errors" "fmt" "os" @@ -112,12 +113,6 @@ type ImageData struct { ID int `json:"id"` } -type llama struct { - api.Options - ImageData []ImageData - Running -} - var ( errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed") errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only") @@ -166,7 +161,8 @@ type prediction struct { } const maxBufferSize = 512 * format.KiloByte -const maxRetries = 6 +const maxRetries = 3 +const retryDelay = 1 * time.Second type 
PredictOpts struct { Prompt string diff --git a/llm/llm.go b/llm/llm.go index 41724d35..86dd3346 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -11,6 +11,7 @@ import ( "github.com/jmorganca/ollama/api" "github.com/jmorganca/ollama/format" + "github.com/jmorganca/ollama/gpu" ) type LLM interface { @@ -19,7 +20,6 @@ type LLM interface { Encode(context.Context, string) ([]int, error) Decode(context.Context, []int) (string, error) Close() - Ping(context.Context) error } func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) { @@ -78,5 +78,17 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) opts.NumGQA = 0 opts.RopeFrequencyBase = 0.0 opts.RopeFrequencyScale = 0.0 - return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts) + gpuInfo := gpu.GetGPUInfo() + switch gpuInfo.Driver { + case "ROCM": + return newRocmShimExtServer(model, adapters, projectors, ggml.NumLayers(), opts) + default: + // Rely on the built-in CUDA based server which will fall back to CPU + return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts) + } +} + +// Give any native cgo implementations an opportunity to initialize +func Init(workdir string) error { + return nativeInit(workdir) } diff --git a/llm/rocm_shim.c b/llm/rocm_shim.c new file mode 100644 index 00000000..9a6595b1 --- /dev/null +++ b/llm/rocm_shim.c @@ -0,0 +1,134 @@ +#include "rocm_shim.h" + +#include <stdio.h> +#include <string.h> + +#ifndef _WIN32 +#include <dlfcn.h> +#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) +#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym) +#define LOAD_ERR() dlerror() +#define UNLOAD_LIBRARY(handle) dlclose(handle) +#else +#include <windows.h> +#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib) +#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym) +#define UNLOAD_LIBRARY(handle) FreeLibrary(handle) +// TODO - refactor this with proper error message handling on windows +inline static char *LOAD_ERR() { + static char errbuf[8]; + snprintf(errbuf, 8, "0x%lx", GetLastError()); + return errbuf; +} +#endif + +void rocm_shim_init(const char *libPath, struct rocm_llama_server *s, + ext_server_resp_t *err) { + int i = 0; + struct lookup { + char *s; + void **p; + } l[] = { + {"llama_server_init", (void *)&s->llama_server_init}, + {"llama_server_start", (void *)&s->llama_server_start}, + {"llama_server_stop", (void *)&s->llama_server_stop}, + {"llama_server_completion", (void *)&s->llama_server_completion}, + {"llama_server_completion_next_result", + (void *)&s->llama_server_completion_next_result}, + {"llama_server_completion_cancel", + (void *)&s->llama_server_completion_cancel}, + {"llama_server_release_task_result", + (void *)&s->llama_server_release_task_result}, + {"llama_server_tokenize", (void *)&s->llama_server_tokenize}, + {"llama_server_detokenize", (void *)&s->llama_server_detokenize}, + {"llama_server_embedding", (void *)&s->llama_server_embedding}, + {"llama_server_release_json_resp", + (void *)&s->llama_server_release_json_resp}, + {"", NULL}, + }; + + printf("Lazy loading %s library\n", libPath); + s->handle = LOAD_LIBRARY(libPath, RTLD_LAZY); + if (!s->handle) { + err->id = -1; + snprintf( + err->msg, err->msg_len, + "Unable to load rocm server library: %s (If you have a Radeon card, " + "did you install the ROCM libraries?)", + LOAD_ERR()); + return; + } + + for (i = 0; l[i].p != NULL; i++) { + *l[i].p = LOAD_SYMBOL(s->handle, l[i].s); + if (!*l[i].p) { + UNLOAD_LIBRARY(s->handle); + err->id = -1; + snprintf(err->msg, err->msg_len, "symbol lookup 
for %s failed: %s", + l[i].s, LOAD_ERR()); + return; + } + } +} + +inline void rocm_shim_llama_server_init(struct rocm_llama_server s, + ext_server_params_t *sparams, + ext_server_resp_t *err) { + s.llama_server_init(sparams, err); +} + +inline void rocm_shim_llama_server_start(struct rocm_llama_server s) { + s.llama_server_start(); +} + +inline void rocm_shim_llama_server_stop(struct rocm_llama_server s) { + s.llama_server_stop(); +} + +inline void rocm_shim_llama_server_completion(struct rocm_llama_server s, + const char *json_req, + ext_server_resp_t *resp) { + s.llama_server_completion(json_req, resp); +} + +inline void rocm_shim_llama_server_completion_next_result( + struct rocm_llama_server s, const int task_id, + ext_server_task_result_t *result) { + s.llama_server_completion_next_result(task_id, result); +} + +inline void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s, + const int task_id, + ext_server_resp_t *err) { + s.llama_server_completion_cancel(task_id, err); +} +inline void rocm_shim_llama_server_release_task_result( + struct rocm_llama_server s, ext_server_task_result_t *result) { + s.llama_server_release_task_result(result); +} + +inline void rocm_shim_llama_server_tokenize(struct rocm_llama_server s, + const char *json_req, + char **json_resp, + ext_server_resp_t *err) { + s.llama_server_tokenize(json_req, json_resp, err); +} + +inline void rocm_shim_llama_server_detokenize(struct rocm_llama_server s, + const char *json_req, + char **json_resp, + ext_server_resp_t *err) { + s.llama_server_detokenize(json_req, json_resp, err); +} + +inline void rocm_shim_llama_server_embedding(struct rocm_llama_server s, + const char *json_req, + char **json_resp, + ext_server_resp_t *err) { + s.llama_server_embedding(json_req, json_resp, err); +} + +inline void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s, + char **json_resp) { + s.llama_server_release_json_resp(json_resp); +} diff --git a/llm/rocm_shim.h b/llm/rocm_shim.h new file mode 100644 index 00000000..d11ed991 --- /dev/null +++ b/llm/rocm_shim.h @@ -0,0 +1,73 @@ +#include + +#include "server.h" + +#ifdef __cplusplus +extern "C" { +#endif +struct rocm_llama_server { + void *handle; + void (*llama_server_init)(ext_server_params_t *sparams, + ext_server_resp_t *err); + void (*llama_server_start)(); + void (*llama_server_stop)(); + void (*llama_server_completion)(const char *json_req, + ext_server_resp_t *resp); + void (*llama_server_completion_next_result)(const int task_id, + ext_server_task_result_t *result); + void (*llama_server_completion_cancel)(const int task_id, + ext_server_resp_t *err); + void (*llama_server_release_task_result)(ext_server_task_result_t *result); + void (*llama_server_tokenize)(const char *json_req, char **json_resp, + ext_server_resp_t *err); + void (*llama_server_detokenize)(const char *json_req, char **json_resp, + ext_server_resp_t *err); + void (*llama_server_embedding)(const char *json_req, char **json_resp, + ext_server_resp_t *err); + void (*llama_server_release_json_resp)(char **json_resp); +}; + +void rocm_shim_init(const char *libPath, struct rocm_llama_server *s, + ext_server_resp_t *err); + +// No good way to call C function pointers from Go so inline the indirection +void rocm_shim_llama_server_init(struct rocm_llama_server s, + ext_server_params_t *sparams, + ext_server_resp_t *err); + +void rocm_shim_llama_server_start(struct rocm_llama_server s); + +void rocm_shim_llama_server_stop(struct rocm_llama_server s); + +void 
rocm_shim_llama_server_completion(struct rocm_llama_server s, + const char *json_req, + ext_server_resp_t *resp); + +void rocm_shim_llama_server_completion_next_result( + struct rocm_llama_server s, const int task_id, + ext_server_task_result_t *result); + +void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s, + const int task_id, + ext_server_resp_t *err); + +void rocm_shim_llama_server_release_task_result( + struct rocm_llama_server s, ext_server_task_result_t *result); + +void rocm_shim_llama_server_tokenize(struct rocm_llama_server s, + const char *json_req, char **json_resp, + ext_server_resp_t *err); + +void rocm_shim_llama_server_detokenize(struct rocm_llama_server s, + const char *json_req, char **json_resp, + ext_server_resp_t *err); + +void rocm_shim_llama_server_embedding(struct rocm_llama_server s, + const char *json_req, char **json_resp, + ext_server_resp_t *err); +void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s, + char **json_resp); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go new file mode 100644 index 00000000..adf02108 --- /dev/null +++ b/llm/shim_darwin.go @@ -0,0 +1,18 @@ +package llm + +import ( + "fmt" + + "github.com/jmorganca/ollama/api" +) + +// no-op stubs for mac + +func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { + // should never happen... + return nil, fmt.Errorf("ROCM GPUs not supported on Mac") +} + +func nativeInit(workDir string) error { + return nil +} diff --git a/llm/shim_ext_server.go b/llm/shim_ext_server.go new file mode 100644 index 00000000..0e7bcfae --- /dev/null +++ b/llm/shim_ext_server.go @@ -0,0 +1,212 @@ +//go:build !darwin + +package llm + +/* + +#include +#include "rocm_shim.h" + +*/ +import "C" +import ( + "context" + "embed" + "errors" + "fmt" + "io" + "io/fs" + "log" + "os" + "path/filepath" + "runtime" + "sync" + "unsafe" + + "github.com/jmorganca/ollama/api" +) + +//go:embed llama.cpp/gguf/build/*/lib/* +var libEmbed embed.FS + +var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. 
Radeon GPUs are not supported") +var NoShim = true + +type shimExtServer struct { + s C.struct_rocm_llama_server + options api.Options +} + +// Note: current implementation does not support concurrent instantiations +var shimMutex sync.Mutex +var llm *shimExtServer + +func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) { + C.rocm_shim_llama_server_init(llm.s, sparams, err) +} +func (llm *shimExtServer) llama_server_start() { + C.rocm_shim_llama_server_start(llm.s) +} +func (llm *shimExtServer) llama_server_stop() { + C.rocm_shim_llama_server_stop(llm.s) +} + +func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) { + C.rocm_shim_llama_server_completion(llm.s, json_req, resp) +} +func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) { + C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp) +} +func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) { + C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err) +} +func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) { + C.rocm_shim_llama_server_release_task_result(llm.s, result) +} + +func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { + C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err) +} +func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { + C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err) +} +func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) { + C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err) +} +func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) { + C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp) +} + +func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { + if NoShim { + return nil, RocmShimMissing + } + log.Printf("Loading ROCM llm server") + if llm == nil { + return nil, fmt.Errorf("nativeInit wasnt called or libary load failed") + } + llm.options = opts + return newExtServer(llm, model, adapters, projectors, numLayers, opts) +} + +func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error { + return predict(llm, llm.options, ctx, pred, fn) +} + +func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) { + return encode(llm, ctx, prompt) +} + +func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) { + return decode(llm, ctx, tokens) +} + +func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) { + return embedding(llm, ctx, input) +} + +func (llm *shimExtServer) Close() { + close(llm) +} + +func nativeInit(workdir string) error { + err := extractLib(workdir) + if err != nil { + if err == RocmShimMissing { + log.Printf("%s", err) + return nil + } + return err + } + + // Verify we have permissions - either running as root, or we have group access to the driver + fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666) + if err != nil { + if errors.Is(err, fs.ErrPermission) { + log.Fatalf("Radeon card detected, but permissions not set up properly. 
Either run ollama as root, or add your user account to the render group.") + return err + } else if errors.Is(err, fs.ErrNotExist) { + // expected behavior without a radeon card + return nil + } + + return fmt.Errorf("failed to check permission on /dev/kfd: %w", err) + } + fd.Close() + + shimMutex.Lock() + defer shimMutex.Unlock() + if llm != nil { + return nil + } + var libName string + switch runtime.GOOS { + case "darwin": + // shouldn't happen + return nil + case "linux": + libName = "librocm_server.so" + case "windows": + libName = "rocm_server.dll" + default: + // shouldn't happen + return nil + } + libPath := C.CString(filepath.Join(workdir, libName)) + defer C.free(unsafe.Pointer(libPath)) + resp := newExtServerResp(128) + defer freeExtServerResp(resp) + var srv C.struct_rocm_llama_server + C.rocm_shim_init(libPath, &srv, &resp) + if resp.id < 0 { + // TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm + // and run against CPU + return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg)) + } + llm = &shimExtServer{ + s: srv, + options: api.DefaultOptions(), + } + return nil +} + +func extractLib(workDir string) error { + files, err := fs.Glob(libEmbed, "llama.cpp/gguf/build/*/lib/*rocm_server*") + if err != nil || len(files) == 0 { + // this is expected, ollama may be compiled without shim library packed in + return RocmShimMissing + } + + if len(files) != 1 { + // Shouldn't happen, but just use the first one we find + log.Printf("WARNING: multiple rocm libraries detected - using %s", files[0]) + } + + srcFile, err := libEmbed.Open(files[0]) + if err != nil { + return fmt.Errorf("read ROCm shim %s: %v", files[0], err) + } + defer srcFile.Close() + if err := os.MkdirAll(workDir, 0o755); err != nil { + return fmt.Errorf("create ROCm shim temp dir %s: %v", workDir, err) + } + + destFile := filepath.Join(workDir, filepath.Base(files[0])) + + _, err = os.Stat(destFile) + switch { + case errors.Is(err, os.ErrNotExist): + destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) + if err != nil { + return fmt.Errorf("write ROCm shim %s: %v", files[0], err) + } + defer destFile.Close() + if _, err := io.Copy(destFile, srcFile); err != nil { + return fmt.Errorf("copy ROCm shim %s: %v", files[0], err) + } + case err != nil: + return fmt.Errorf("stat ROCm shim %s: %v", files[0], err) + } + NoShim = false + return nil +} diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 20b44bf7..06a2ae1c 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -8,7 +8,7 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version mkdir -p dist for TARGETARCH in amd64 arm64; do - docker buildx build --load --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH . + docker buildx build --load --progress=plain --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH . 
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH docker rm builder-$TARGETARCH diff --git a/scripts/build_remote.py b/scripts/build_remote.py new file mode 100755 index 00000000..db824e4b --- /dev/null +++ b/scripts/build_remote.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +import subprocess +import sys +from urllib.parse import urlparse +from git import Repo + +# Helper script to be able to build on remote repos using git to push local changes +# (e.g. particularly helpful to target a remote windows build system) +# +# Typical windows remote git config looks like this: +# +#[remote "windows-pa"] +# url = jdoe@desktop-foo:C:/Users/Jdoe/code/ollama +# fetch = +refs/heads/*:refs/remotes/windows-pa/* +# uploadpack = powershell git upload-pack +# receivepack = powershell git receive-pack +# + +# TODO - add argparse and make this more configurable +# - force flag becomes optional +# - generate, build or test ... + +# Note: remote repo will need this run once: +# git config --local receive.denyCurrentBranch updateInstead +repo = Repo(".") + +# On linux, add links in /usr/local/bin to the go binaries to avoid needing this +# GoCmd = "/usr/local/go/bin/go" +GoCmd = "go" + +if repo.is_dirty(): + print("Tree is dirty. Commit your changes before running this script") + sys.exit(1) + +if len(sys.argv) != 2: + print("Please specify the remote name: " + ', '.join([r.name for r in repo.remotes])) + sys.exit(1) +remote_name = sys.argv[1] + +remote = {r.name: r for r in repo.remotes}[remote_name] +raw_url = list(remote.urls)[0] +url = urlparse(raw_url) +# Windows urls don't quite parse properly +if url.scheme == "" and url.netloc == "": + url = urlparse("ssh://" + raw_url) +print("URL: " + str(url)) +netloc = url.netloc.split(":")[0] +path = url.path +branch_name = repo.active_branch.name + +print("Force pushing content to remote...") +# Use with care given the force push +remote.push(force=True).raise_if_error() + +print("Ensuring correct branch checked out on remote via ssh...") +subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'git', 'checkout', branch_name]) + + +# TODO - add some hardening to try to figure out how to set up the path properly +# subprocess.check_call(['ssh', netloc, 'cd', path, ';', 'env']) +# TODO - or consider paramiko maybe + +print("Performing generate") +subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'generate', './...']) + +print("Building") +subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.']) + diff --git a/scripts/setup_integration_tests.sh b/scripts/setup_integration_tests.sh index a1d01ac1..a8651bc0 100755 --- a/scripts/setup_integration_tests.sh +++ b/scripts/setup_integration_tests.sh @@ -32,4 +32,4 @@ for LAYER in $(cat ${OLLAMA_MODELS}/manifests/${REGISTRY}/${TEST_MODEL}/${TEST_M curl -L -C - --header "${ACCEPT_HEADER}" \ -o ${OLLAMA_MODELS}/blobs/${LAYER} \ ${REGISTRY_SCHEME}://${REGISTRY}/v2/${TEST_MODEL}/blobs/${LAYER} -done \ No newline at end of file +done diff --git a/server/llm_test.go b/server/llm_test.go index 167c5831..ad0823f6 100644 --- a/server/llm_test.go +++ b/server/llm_test.go @@ -2,14 +2,17 @@ package server import ( "context" + "os" "strings" "sync" "testing" "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/jmorganca/ollama/api" + "github.com/jmorganca/ollama/llm" ) // TODO - this would ideally be in the llm package, but
that would require some refactoring of interfaces in the server @@ -33,12 +36,16 @@ var ( } resp = [2]string{ "once upon a time", - "fourth thursday", + "united states thanksgiving", } ) func TestIntegrationSimpleOrcaMini(t *testing.T) { SkipIFNoTestData(t) + workDir, err := os.MkdirTemp("", "ollama") + require.NoError(t, err) + defer os.RemoveAll(workDir) + require.NoError(t, llm.Init(workDir)) ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) defer cancel() opts := api.DefaultOptions() @@ -56,7 +63,13 @@ func TestIntegrationSimpleOrcaMini(t *testing.T) { // get true concurrency working with n_parallel support in the backend func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { SkipIFNoTestData(t) + t.Skip("concurrent prediction on single runner not currently supported") + + workDir, err := os.MkdirTemp("", "ollama") + require.NoError(t, err) + defer os.RemoveAll(workDir) + require.NoError(t, llm.Init(workDir)) ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) defer cancel() opts := api.DefaultOptions() @@ -79,6 +92,10 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) { SkipIFNoTestData(t) + workDir, err := os.MkdirTemp("", "ollama") + require.NoError(t, err) + defer os.RemoveAll(workDir) + require.NoError(t, llm.Init(workDir)) ctx, cancel := context.WithTimeout(context.Background(), time.Second*60) defer cancel() opts := api.DefaultOptions() @@ -87,6 +104,7 @@ func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) { var wg sync.WaitGroup wg.Add(len(req)) + t.Logf("Running %d concurrently", len(req)) for i := 0; i < len(req); i++ { go func(i int) { defer wg.Done() diff --git a/server/routes.go b/server/routes.go index 26a02cc1..75e67a72 100644 --- a/server/routes.go +++ b/server/routes.go @@ -25,6 +25,7 @@ import ( "github.com/gin-gonic/gin" "github.com/jmorganca/ollama/api" + "github.com/jmorganca/ollama/gpu" "github.com/jmorganca/ollama/llm" "github.com/jmorganca/ollama/parser" "github.com/jmorganca/ollama/version" @@ -81,20 +82,6 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess return nil, err } - ctx := c.Request.Context() - - // check if the loaded model is still running in a subprocess, in case something unexpected happened - if loaded.runner != nil { - if err := loaded.runner.Ping(ctx); err != nil { - log.Print("loaded llm process not responding, closing now") - // the subprocess is no longer running, so close it - loaded.runner.Close() - loaded.runner = nil - loaded.Model = nil - loaded.Options = nil - } - } - needLoad := loaded.runner == nil || // is there a model loaded? loaded.ModelPath != model.ModelPath || // has the base model changed? !reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed? @@ -905,9 +892,12 @@ func Serve(ln net.Listener) error { os.Exit(0) }() - if runtime.GOOS == "linux" { + if err := llm.Init(s.WorkDir); err != nil { + return fmt.Errorf("unable to initialize llm library %w", err) + } + if runtime.GOOS == "linux" { // TODO - windows too // check compatibility to log warnings - if _, err := llm.CheckVRAM(); err != nil { + if _, err := gpu.CheckVRAM(); err != nil { log.Print(err.Error()) } }
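Reviewer note: below is a minimal sketch of how the pieces introduced in this diff fit together at startup, mirroring the order used in `server.Serve` and the integration tests (create a work directory, call `llm.Init`, then consult the `gpu` package). The `main` wrapper and log wording are illustrative assumptions for this note only, not part of the change itself.

```go
package main

import (
	"log"
	"os"

	"github.com/jmorganca/ollama/gpu"
	"github.com/jmorganca/ollama/llm"
)

func main() {
	// Work directory used to extract any embedded native libraries (e.g. the ROCm shim).
	workDir, err := os.MkdirTemp("", "ollama")
	if err != nil {
		log.Fatal(err)
	}
	defer os.RemoveAll(workDir)

	// Give the native cgo implementations a chance to initialize; this is where
	// the packed ROCm shim library is extracted and loaded when present.
	if err := llm.Init(workDir); err != nil {
		log.Fatalf("unable to initialize llm library: %v", err)
	}

	// llm.New switches on the same Driver value ("ROCM" selects the shim server,
	// anything else falls through to the built-in server).
	info := gpu.GetGPUInfo()
	log.Printf("detected driver: %s", info.Driver)

	// Same warning path Serve takes when no usable GPU memory is reported.
	if _, err := gpu.CheckVRAM(); err != nil {
		log.Print(err.Error())
	}
}
```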