From 1b249748abdf6905edb4a94c6a367d6cbbc4d00a Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 12 Jan 2024 16:28:00 -0800 Subject: [PATCH] Add multiple CPU variants for Intel Mac This also refines the build process for the ext_server build. --- .github/workflows/test.yaml | 3 ++ Dockerfile.build | 10 +++- llm/dyn_ext_server.c | 6 +-- llm/dyn_ext_server.go | 11 +---- llm/ext_server/CMakeLists.txt | 24 ++++----- llm/generate/gen_common.sh | 73 +++++++++++++++++++-------- llm/generate/gen_darwin.sh | 65 ++++++++++++++++--------- llm/generate/gen_linux.sh | 87 +++++++++++++++------------------ llm/generate/gen_windows.ps1 | 70 ++++++++++++++++++++------ llm/payload_common.go | 92 ++++++++++++++++++++++------------- llm/payload_darwin.go | 8 --- llm/payload_darwin_amd64.go | 8 +++ llm/payload_darwin_arm64.go | 8 +++ llm/payload_linux.go | 2 +- llm/payload_windows.go | 2 +- scripts/build_darwin.sh | 31 +++++++++--- scripts/build_remote.py | 4 ++ scripts/rh_linux_deps.sh | 1 + 18 files changed, 320 insertions(+), 185 deletions(-) delete mode 100644 llm/payload_darwin.go create mode 100644 llm/payload_darwin_amd64.go create mode 100644 llm/payload_darwin_arm64.go diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 441a66e2..048d92af 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -86,6 +86,9 @@ jobs: - os: windows-latest arch: arm64 runs-on: ${{ matrix.os }} + env: + GOARCH: ${{ matrix.arch }} + CGO_ENABLED: "1" steps: - uses: actions/checkout@v4 with: diff --git a/Dockerfile.build b/Dockerfile.build index 96b06138..6b52322b 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -10,6 +10,7 @@ COPY llm llm FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64 ARG CMAKE_VERSION +ARG CGO_CFLAGS COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH @@ -19,6 +20,7 @@ RUN 
OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64 ARG CMAKE_VERSION +ARG CGO_CFLAGS COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH @@ -28,6 +30,7 @@ RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete AS rocm-5-build-amd64 ARG CMAKE_VERSION +ARG CGO_CFLAGS COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH @@ -38,6 +41,7 @@ RUN OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete AS rocm-6-build-amd64 ARG CMAKE_VERSION +ARG CGO_CFLAGS COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH @@ -50,6 +54,7 @@ FROM --platform=linux/amd64 centos:7 AS cpu-build-amd64 ARG CMAKE_VERSION ARG GOLANG_VERSION ARG OLLAMA_CUSTOM_CPU_DEFS +ARG CGO_CFLAGS COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH @@ -61,6 +66,7 @@ FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64 ARG CMAKE_VERSION ARG GOLANG_VERSION ARG OLLAMA_CUSTOM_CPU_DEFS +ARG CGO_CFLAGS COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH @@ -72,7 +78,7 @@ RUN sh gen_linux.sh FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64 ENV CGO_ENABLED 1 ARG GOFLAGS -ARG CGO_FLAGS +ARG CGO_CFLAGS WORKDIR /go/src/github.com/jmorganca/ollama COPY . . 
COPY --from=cuda-build-amd64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ @@ -84,7 +90,7 @@ FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 ENV CGO_ENABLED 1 ARG GOLANG_VERSION ARG GOFLAGS -ARG CGO_FLAGS +ARG CGO_CFLAGS WORKDIR /go/src/github.com/jmorganca/ollama COPY . . COPY --from=cuda-build-arm64 /go/src/github.com/jmorganca/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/ diff --git a/llm/dyn_ext_server.c b/llm/dyn_ext_server.c index 111e4ab5..e9aa3481 100644 --- a/llm/dyn_ext_server.c +++ b/llm/dyn_ext_server.c @@ -5,7 +5,7 @@ #ifdef __linux__ #include -#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags | RTLD_DEEPBIND) +#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags) #define LOAD_SYMBOL(handle, sym) dlsym(handle, sym) #define LOAD_ERR() strdup(dlerror()) #define UNLOAD_LIBRARY(handle) dlclose(handle) @@ -58,8 +58,8 @@ void dyn_init(const char *libPath, struct dynamic_llama_server *s, {"", NULL}, }; - printf("loading %s library\n", libPath); - s->handle = LOAD_LIBRARY(libPath, RTLD_NOW); + printf("loading library %s\n", libPath); + s->handle = LOAD_LIBRARY(libPath, RTLD_GLOBAL|RTLD_NOW); if (!s->handle) { err->id = -1; char *msg = LOAD_ERR(); diff --git a/llm/dyn_ext_server.go b/llm/dyn_ext_server.go index fa0d7750..496dcf7c 100644 --- a/llm/dyn_ext_server.go +++ b/llm/dyn_ext_server.go @@ -372,15 +372,6 @@ func updatePath(dir string) { newPath := strings.Join(append([]string{dir}, pathComponents...), ";") log.Printf("Updating PATH to %s", newPath) os.Setenv("PATH", newPath) - } else { - pathComponents := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":") - for _, comp := range pathComponents { - if comp == dir { - return - } - } - newPath := strings.Join(append([]string{dir}, pathComponents...), ":") - log.Printf("Updating LD_LIBRARY_PATH to %s", newPath) - os.Setenv("LD_LIBRARY_PATH", newPath) } + // linux and darwin rely on rpath } diff --git a/llm/ext_server/CMakeLists.txt 
b/llm/ext_server/CMakeLists.txt index 2450c2d5..dd1831fc 100644 --- a/llm/ext_server/CMakeLists.txt +++ b/llm/ext_server/CMakeLists.txt @@ -2,28 +2,24 @@ set(TARGET ext_server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) -add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp) +if (WIN32) + add_library(${TARGET} SHARED ../../../ext_server/ext_server.cpp ../../llama.cpp) +else() + add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp ../../llama.cpp) +endif() target_include_directories(${TARGET} PRIVATE ../../common) target_include_directories(${TARGET} PRIVATE ../..) target_include_directories(${TARGET} PRIVATE ../../..) target_compile_features(${TARGET} PRIVATE cxx_std_11) target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1) -target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_definitions(${TARGET} PRIVATE - SERVER_VERBOSE=$ -) - -if (BUILD_SHARED_LIBS) - set_target_properties(ext_server PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(ext_server PRIVATE LLAMA_SHARED LLAMA_BUILD) - add_library(ext_server_shared SHARED $) - target_link_libraries(ext_server_shared PRIVATE ggml llama llava common ${CMAKE_THREAD_LIBS_INIT}) - install(TARGETS ext_server_shared LIBRARY) -endif() +target_link_libraries(${TARGET} PRIVATE ggml llava common ) +set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$) +install(TARGETS ext_server LIBRARY) if (CUDAToolkit_FOUND) target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) if (WIN32) - target_link_libraries(ext_server_shared PRIVATE nvml) + target_link_libraries(${TARGET} PRIVATE nvml) endif() endif() \ No newline at end of file diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index beec1be9..703cb1d4 100644 --- a/llm/generate/gen_common.sh +++ 
b/llm/generate/gen_common.sh @@ -1,15 +1,44 @@ # common logic accross linux and darwin init_vars() { + case "${GOARCH}" in + "amd64") + ARCH="x86_64" + ;; + "arm64") + ARCH="arm64" + ;; + *) + ARCH=$(uname -m | sed -e "s/aarch64/arm64/g") + esac + LLAMACPP_DIR=../llama.cpp CMAKE_DEFS="" - CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static" + CMAKE_TARGETS="--target ext_server" if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then - CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on" + CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}" else # TODO - add additional optimization flags... - CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off" + CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}" fi + case $(uname -s) in + "Darwin") + LIB_EXT="dylib" + WHOLE_ARCHIVE="-Wl,-force_load" + NO_WHOLE_ARCHIVE="" + GCC_ARCH="-arch ${ARCH}" + ;; + "Linux") + LIB_EXT="so" + WHOLE_ARCHIVE="-Wl,--whole-archive" + NO_WHOLE_ARCHIVE="-Wl,--no-whole-archive" + + # Cross compiling not supported on linux - Use docker + GCC_ARCH="" + ;; + *) + ;; + esac } git_module_setup() { @@ -40,25 +69,29 @@ apply_patches() { build() { cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS} cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 + mkdir -p ${BUILD_DIR}/lib/ + g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \ + ${GCC_ARCH} \ + ${WHOLE_ARCHIVE} ${BUILD_DIR}/examples/server/libext_server.a ${NO_WHOLE_ARCHIVE} \ + ${BUILD_DIR}/common/libcommon.a \ + ${BUILD_DIR}/libllama.a \ + -Wl,-rpath,\$ORIGIN \ + -lpthread -ldl -lm \ + ${EXTRA_LIBS} } -install() { - rm -rf ${BUILD_DIR}/lib - mkdir -p ${BUILD_DIR}/lib - cp ${BUILD_DIR}/examples/server/libext_server.a ${BUILD_DIR}/lib - cp 
${BUILD_DIR}/common/libcommon.a ${BUILD_DIR}/lib - cp ${BUILD_DIR}/libllama.a ${BUILD_DIR}/lib - cp ${BUILD_DIR}/libggml_static.a ${BUILD_DIR}/lib -} - -link_server_lib() { - gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ - -Wl,--whole-archive \ - ${BUILD_DIR}/lib/libext_server.a \ - -Wl,--no-whole-archive \ - ${BUILD_DIR}/lib/libcommon.a \ - ${BUILD_DIR}/lib/libllama.a \ - -lstdc++ +compress_libs() { + echo "Compressing payloads to reduce overall binary size..." + pids="" + for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do + bzip2 -v9 ${lib} & + pids+=" $!" + done + echo + for pid in ${pids}; do + wait $pid + done + echo "Finished compression" } # Keep the local tree clean after we're done with the build diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 3a57d0cb..cf131af5 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -9,16 +9,52 @@ set -o pipefail echo "Starting darwin generate script" source $(dirname $0)/gen_common.sh init_vars -CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}" -BUILD_DIR="${LLAMACPP_DIR}/build/darwin/metal" +git_module_setup +apply_patches + +COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_ACCELERATE=off" + case "${GOARCH}" in "amd64") - CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_METAL=off -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" - ARCH="x86_64" + COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off" + + # + # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) + # + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + 
BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu" + echo "Building LCD CPU" + build + compress_libs + + # + # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance + # Approximately 400% faster than LCD on same CPU + # + init_vars + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" + BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx" + echo "Building AVX CPU" + build + compress_libs + + # + # ~2013 CPU Dynamic library + # Approximately 10% faster than AVX on same CPU + # + init_vars + CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" + BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2" + echo "Building AVX2 CPU" + build + compress_libs ;; "arm64") - CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DLLAMA_METAL=on ${CMAKE_DEFS}" - ARCH="arm64" + CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}" + BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal" + EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" + build + compress_libs ;; *) echo "GOARCH must be set" @@ -27,21 +63,4 @@ case "${GOARCH}" in ;; esac -git_module_setup -apply_patches -build -install -gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ - -arch ${ARCH} \ - -Wl,-force_load ${BUILD_DIR}/lib/libext_server.a \ - ${BUILD_DIR}/lib/libcommon.a \ - ${BUILD_DIR}/lib/libllama.a \ - ${BUILD_DIR}/lib/libggml_static.a \ - -lpthread -ldl -lm -lc++ \ - -framework Accelerate \ - -framework Foundation \ - -framework Metal \ - -framework MetalKit \ - -framework MetalPerformanceShaders - cleanup diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 10ca450b..f5085bda 100755 
--- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -2,16 +2,14 @@ # This script is intended to run inside the go generate # working directory must be llm/generate/ -# First we build our default built-in library which will be linked into the CGO -# binary as a normal dependency. This default build is CPU based. +# First we build one or more CPU based LLM libraries # -# Then we build a CUDA dynamic library (although statically linked with the CUDA -# library dependencies for maximum portability) +# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required +# library dependencies # -# Then if we detect ROCm, we build a dynamically loaded ROCm lib. ROCm is particularly -# important to be a dynamic lib even if it's the only GPU library detected because -# we can't redistribute the objectfiles but must rely on dynamic libraries at -# runtime, which could lead the server not to start if not present. +# Then if we detect ROCm, we build a dynamically loaded ROCm lib. 
The ROCM +# libraries are quite large, and also dynamically load data files at runtime +# which in turn are large, so we don't attempt to carry them as payload set -ex set -o pipefail @@ -59,11 +57,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu" echo "Building custom CPU" build - install - link_server_lib + compress_libs else # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer @@ -80,11 +77,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) # CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu" echo "Building LCD CPU" build - install - link_server_lib + compress_libs # # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance @@ -92,11 +88,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx" echo "Building AVX CPU" build - install - link_server_lib + compress_libs # # ~2013 CPU Dynamic library @@ -104,11 +99,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}" - 
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx2" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2" echo "Building AVX2 CPU" build - install - link_server_lib + compress_libs fi else echo "Skipping CPU generation step as requested" @@ -127,22 +121,27 @@ if [ -d "${CUDA_LIB_DIR}" ]; then CUDA_VARIANT=_v${CUDA_MAJOR} fi CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda${CUDA_VARIANT}" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}" + EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" build - install - gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ - -Wl,--whole-archive \ - ${BUILD_DIR}/lib/libext_server.a \ - ${BUILD_DIR}/lib/libcommon.a \ - ${BUILD_DIR}/lib/libllama.a \ - -Wl,--no-whole-archive \ - ${CUDA_LIB_DIR}/libcudart_static.a \ - ${CUDA_LIB_DIR}/libcublas_static.a \ - ${CUDA_LIB_DIR}/libcublasLt_static.a \ - ${CUDA_LIB_DIR}/libcudadevrt.a \ - ${CUDA_LIB_DIR}/libculibos.a \ - -lcuda \ - -lrt -lpthread -ldl -lstdc++ -lm + + # Carry the CUDA libs as payloads to help reduce dependency burden on users + # + # TODO - in the future we may shift to packaging these separately and conditionally + # downloading them in the install script. 
+ DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )" + for lib in libcudart.so libcublas.so libcublasLt.so ; do + DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true) + if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then + cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/" + elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then + cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/" + else + cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/" + fi + done + compress_libs + fi if [ -z "${ROCM_PATH}" ]; then @@ -164,19 +163,13 @@ if [ -d "${ROCM_PATH}" ]; then fi init_vars CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" - BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm${ROCM_VARIANT}" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}" + EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,${ROCM_PATH}/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" build - install - gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ - -Wl,--whole-archive \ - ${BUILD_DIR}/lib/libext_server.a \ - ${BUILD_DIR}/lib/libcommon.a \ - ${BUILD_DIR}/lib/libllama.a \ - -Wl,--no-whole-archive \ - -lrt -lpthread -ldl -lstdc++ -lm \ - -L/opt/rocm/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ \ - -Wl,-rpath,/opt/rocm/lib,-rpath,/opt/amdgpu/lib/x86_64-linux-gnu/ \ - -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu + + # Note: the ROCM libs and runtime library files are too large to embed, so we depend on + # them being present at runtime on the host + compress_libs fi cleanup diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 1bc08c69..dc0992f5 100644 --- a/llm/generate/gen_windows.ps1 +++ 
b/llm/generate/gen_windows.ps1 @@ -5,7 +5,8 @@ $ErrorActionPreference = "Stop" function init_vars { $script:llamacppDir = "../llama.cpp" $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A","x64") - $script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static") + $script:cmakeTargets = @("ext_server") + $script:ARCH = "amd64" # arm not yet supported. if ($env:CGO_CFLAGS -contains "-g") { $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on") $script:config = "RelWithDebInfo" @@ -13,6 +14,17 @@ function init_vars { $script:cmakeDefs += @("-DLLAMA_SERVER_VERBOSE=off") $script:config = "Release" } + # Try to find the CUDA dir + if ($env:CUDA_LIB_DIR -eq $null) { + $d=(get-command -ea 'silentlycontinue' nvcc).path + if ($d -ne $null) { + $script:CUDA_LIB_DIR=($d| split-path -parent) + } + } else { + $script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR + } + $script:BZIP2=(get-command -ea 'silentlycontinue' bzip2).path + $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path } function git_module_setup { @@ -47,11 +59,25 @@ function build { function install { rm -ea 0 -recurse -force -path "${script:buildDir}/lib" md "${script:buildDir}/lib" -ea 0 > $null - cp "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" "${script:buildDir}/lib" + cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib" cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib" # Display the dll dependencies in the build log - dumpbin /dependents "${script:buildDir}/bin/${script:config}/ext_server_shared.dll" | select-string ".dll" + if ($script:DUMPBIN -ne $null) { + & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll" + } +} + +function compress_libs { + if ($script:BZIP2 -eq $null) { + write-host "bzip2 not installed, not compressing files" + return + } + write-host 
"Compressing dlls..." + $libs = dir "${script:buildDir}/lib/*.dll" + foreach ($file in $libs) { + & "$script:BZIP2" -v9 $file + } } function cleanup { @@ -71,33 +97,47 @@ apply_patches $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off") $script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs -$script:buildDir="${script:llamacppDir}/build/windows/cpu" +$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu" write-host "Building LCD CPU" build install +compress_libs $script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs -$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx" +$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx" write-host "Building AVX CPU" build install +compress_libs $script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs -$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx2" +$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2" write-host "Building AVX2 CPU" build install +compress_libs -# Then build cuda as a dynamically loaded library -# TODO figure out how to detect cuda version -init_vars -$script:buildDir="${script:llamacppDir}/build/windows/cuda" -$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on") -build -install - +if ($null -ne $script:CUDA_LIB_DIR) { + # Then build cuda as a dynamically loaded library + $nvcc = (get-command -ea 'silentlycontinue' nvcc) + if ($null -ne $nvcc) { + $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename + } + if ($null -ne $script:CUDA_VERSION) { + $script:CUDA_VARIANT="_"+$script:CUDA_VERSION + } + init_vars + 
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT" + $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on") + build + install + cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib" + cp "${script:CUDA_LIB_DIR}/cublas64_*.dll" "${script:buildDir}/lib" + cp "${script:CUDA_LIB_DIR}/cublasLt64_*.dll" "${script:buildDir}/lib" + compress_libs +} # TODO - actually implement ROCm support on windows -$script:buildDir="${script:llamacppDir}/build/windows/rocm" +$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm" rm -ea 0 -recurse -force -path "${script:buildDir}/lib" md "${script:buildDir}/lib" -ea 0 > $null diff --git a/llm/payload_common.go b/llm/payload_common.go index 79ea76de..2b01f966 100644 --- a/llm/payload_common.go +++ b/llm/payload_common.go @@ -1,9 +1,9 @@ package llm import ( + "compress/bzip2" "errors" "fmt" - "golang.org/x/exp/slices" "io" "io/fs" "log" @@ -12,6 +12,9 @@ import ( "runtime" "strings" + "golang.org/x/exp/slices" + "golang.org/x/sync/errgroup" + "github.com/jmorganca/ollama/gpu" ) @@ -20,7 +23,7 @@ import ( // Any library without a variant is the lowest common denominator var availableDynLibs = map[string]string{} -const pathComponentCount = 6 +const pathComponentCount = 7 // getDynLibs returns an ordered list of LLM libraries to try, starting with the best func getDynLibs(gpuInfo gpu.GpuInfo) []string { @@ -100,6 +103,7 @@ func rocmDynLibPresent() bool { } func nativeInit(workdir string) error { + log.Printf("Extracting dynamic libraries...") if runtime.GOOS == "darwin" { err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal") if err != nil { @@ -113,7 +117,7 @@ func nativeInit(workdir string) error { os.Setenv("GGML_METAL_PATH_RESOURCES", workdir) } - libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*") + libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/*/lib/*") if err != nil { if err == payloadMissing { 
log.Printf("%s", payloadMissing) @@ -151,45 +155,61 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) { } libs := []string{} + // TODO consider making this idempotent with some sort of persistent directory (where we store models probably) + // and tracking by version so we don't reexpand the files every time + // Also maybe consider lazy loading only what is needed + + g := new(errgroup.Group) for _, file := range files { pathComps := strings.Split(file, "/") if len(pathComps) != pathComponentCount { log.Printf("unexpected payload components: %v", pathComps) continue } - // llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY - // Include the variant in the path to avoid conflicts between multiple server libs - targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3]) - srcFile, err := libEmbed.Open(file) - if err != nil { - return nil, fmt.Errorf("read payload %s: %v", file, err) - } - defer srcFile.Close() - if err := os.MkdirAll(targetDir, 0o755); err != nil { - return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err) - } - destFile := filepath.Join(targetDir, filepath.Base(file)) - if strings.Contains(destFile, "server") { - libs = append(libs, destFile) - } - - _, err = os.Stat(destFile) - switch { - case errors.Is(err, os.ErrNotExist): - destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) + file := file + g.Go(func() error { + // llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY + // Include the variant in the path to avoid conflicts between multiple server libs + targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3]) + srcFile, err := libEmbed.Open(file) if err != nil { - return nil, fmt.Errorf("write payload %s: %v", file, err) + return fmt.Errorf("read payload %s: %v", file, err) } - defer destFile.Close() - if _, err := io.Copy(destFile, srcFile); err != nil { - return nil, fmt.Errorf("copy payload %s: %v", file, err) + defer srcFile.Close() + if err := os.MkdirAll(targetDir, 
0o755); err != nil { + return fmt.Errorf("create payload temp dir %s: %v", workDir, err) } - case err != nil: - return nil, fmt.Errorf("stat payload %s: %v", file, err) - } + src := io.Reader(srcFile) + filename := file + if strings.HasSuffix(file, ".bz2") { + src = bzip2.NewReader(src) + filename = strings.TrimSuffix(filename, ".bz2") + } + + destFile := filepath.Join(targetDir, filepath.Base(filename)) + if strings.Contains(destFile, "server") { + libs = append(libs, destFile) + } + + _, err = os.Stat(destFile) + switch { + case errors.Is(err, os.ErrNotExist): + destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) + if err != nil { + return fmt.Errorf("write payload %s: %v", file, err) + } + defer destFile.Close() + if _, err := io.Copy(destFile, src); err != nil { + return fmt.Errorf("copy payload %s: %v", file, err) + } + case err != nil: + return fmt.Errorf("stat payload %s: %v", file, err) + } + return nil + }) } - return libs, nil + return libs, g.Wait() } func extractPayloadFiles(workDir, glob string) error { @@ -207,8 +227,14 @@ func extractPayloadFiles(workDir, glob string) error { if err := os.MkdirAll(workDir, 0o755); err != nil { return fmt.Errorf("create payload temp dir %s: %v", workDir, err) } + src := io.Reader(srcFile) + filename := file + if strings.HasSuffix(file, ".bz2") { + src = bzip2.NewReader(src) + filename = strings.TrimSuffix(filename, ".bz2") + } - destFile := filepath.Join(workDir, filepath.Base(file)) + destFile := filepath.Join(workDir, filepath.Base(filename)) _, err = os.Stat(destFile) switch { case errors.Is(err, os.ErrNotExist): @@ -217,7 +243,7 @@ func extractPayloadFiles(workDir, glob string) error { return fmt.Errorf("write payload %s: %v", file, err) } defer destFile.Close() - if _, err := io.Copy(destFile, srcFile); err != nil { + if _, err := io.Copy(destFile, src); err != nil { return fmt.Errorf("copy payload %s: %v", file, err) } case err != nil: diff --git a/llm/payload_darwin.go 
b/llm/payload_darwin.go deleted file mode 100644 index 1a5f042a..00000000 --- a/llm/payload_darwin.go +++ /dev/null @@ -1,8 +0,0 @@ -package llm - -import ( - "embed" -) - -//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/*/lib/*.so -var libEmbed embed.FS diff --git a/llm/payload_darwin_amd64.go b/llm/payload_darwin_amd64.go new file mode 100644 index 00000000..a1c70ba9 --- /dev/null +++ b/llm/payload_darwin_amd64.go @@ -0,0 +1,8 @@ +package llm + +import ( + "embed" +) + +//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/x86_64/*/lib/*.dylib* +var libEmbed embed.FS diff --git a/llm/payload_darwin_arm64.go b/llm/payload_darwin_arm64.go new file mode 100644 index 00000000..aa70c931 --- /dev/null +++ b/llm/payload_darwin_arm64.go @@ -0,0 +1,8 @@ +package llm + +import ( + "embed" +) + +//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib* +var libEmbed embed.FS diff --git a/llm/payload_linux.go b/llm/payload_linux.go index afef040a..fc366209 100644 --- a/llm/payload_linux.go +++ b/llm/payload_linux.go @@ -4,5 +4,5 @@ import ( "embed" ) -//go:embed llama.cpp/build/linux/*/lib/*.so +//go:embed llama.cpp/build/linux/*/*/lib/*.so* var libEmbed embed.FS diff --git a/llm/payload_windows.go b/llm/payload_windows.go index 21c6cc4d..d195745a 100644 --- a/llm/payload_windows.go +++ b/llm/payload_windows.go @@ -4,5 +4,5 @@ import ( "embed" ) -//go:embed llama.cpp/build/windows/*/lib/*.dll +//go:embed llama.cpp/build/windows/*/*/lib/*.dll* var libEmbed embed.FS diff --git a/scripts/build_darwin.sh b/scripts/build_darwin.sh index 238fba87..114240e3 100755 --- a/scripts/build_darwin.sh +++ b/scripts/build_darwin.sh @@ -1,6 +1,6 @@ #!/bin/sh -set -eu +set -e export VERSION=${VERSION:-0.0.0} export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version=$VERSION\" \"-X=github.com/jmorganca/ollama/server.mode=release\"'" @@ -11,21 +11,36 @@ for TARGETARCH in arm64 amd64; do rm -rf llm/llama.cpp/build GOOS=darwin 
GOARCH=$TARGETARCH go generate ./... CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -o dist/ollama-darwin-$TARGETARCH + CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -cover -o dist/ollama-darwin-$TARGETARCH-cov done -lipo -create -output dist/ollama dist/ollama-darwin-* -rm -f dist/ollama-darwin-* -codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama +lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64 +rm -f dist/ollama-darwin-arm64 dist/ollama-darwin-amd64 +if [ -n "$APPLE_IDENTITY" ]; then + codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama +else + echo "Skipping code signing - set APPLE_IDENTITY" +fi chmod +x dist/ollama -# build and sign the mac app +# build and optionally sign the mac app npm install --prefix app -npm run --prefix app make:sign +if [ -n "$APPLE_IDENTITY" ]; then + npm run --prefix app make:sign +else + npm run --prefix app make +fi cp app/out/make/zip/darwin/universal/Ollama-darwin-universal-$VERSION.zip dist/Ollama-darwin.zip # sign the binary and rename it -codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama +if [ -n "$APPLE_IDENTITY" ]; then + codesign -f --timestamp -s "$APPLE_IDENTITY" --identifier ai.ollama.ollama --options=runtime dist/ollama +else + echo "WARNING: Skipping code signing - set APPLE_IDENTITY" +fi ditto -c -k --keepParent dist/ollama dist/temp.zip -xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID +if [ -n "$APPLE_IDENTITY" ]; then + xcrun notarytool submit dist/temp.zip --wait --timeout 10m --apple-id $APPLE_ID --password $APPLE_PASSWORD --team-id $APPLE_TEAM_ID +fi mv dist/ollama dist/ollama-darwin rm -f dist/temp.zip diff --git a/scripts/build_remote.py b/scripts/build_remote.py index db824e4b..314232ac 100755 --- a/scripts/build_remote.py +++ 
b/scripts/build_remote.py @@ -66,3 +66,7 @@ subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'generate', './... print("Building") subprocess.check_call(['ssh', netloc, 'cd', path, ';', GoCmd, 'build', '.']) +print("Copying built result") +subprocess.check_call(['scp', netloc +":"+ path + "/ollama.exe", './dist/']) + + diff --git a/scripts/rh_linux_deps.sh b/scripts/rh_linux_deps.sh index ec6b20a0..1d5b181c 100644 --- a/scripts/rh_linux_deps.sh +++ b/scripts/rh_linux_deps.sh @@ -28,6 +28,7 @@ fi if [ -n "${CMAKE_VERSION}" ]; then curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 + dnf install -y bzip2 fi if [ -n "${GOLANG_VERSION}" ]; then