diff --git a/.dockerignore b/.dockerignore index f27e7fd6..a1f8beae 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,7 +2,7 @@ ollama app dist -llm/llama.cpp/gguf +llm/llama.cpp .env .cache test_data \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index 5a65e4df..a0a9fd10 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,5 +1,5 @@ [submodule "llm/llama.cpp/gguf"] - path = llm/llama.cpp/gguf + path = llm/llama.cpp/gguf url = https://github.com/ggerganov/llama.cpp.git ignore = dirty shallow = true diff --git a/llm/llama.cpp/CMakeLists.txt b/llm/ext_server/CMakeLists.txt similarity index 94% rename from llm/llama.cpp/CMakeLists.txt rename to llm/ext_server/CMakeLists.txt index 9553ad5e..2450c2d5 100644 --- a/llm/llama.cpp/CMakeLists.txt +++ b/llm/ext_server/CMakeLists.txt @@ -2,7 +2,7 @@ set(TARGET ext_server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) -add_library(${TARGET} STATIC ../../../ext_server.cpp) +add_library(${TARGET} STATIC ../../../ext_server/ext_server.cpp) target_include_directories(${TARGET} PRIVATE ../../common) target_include_directories(${TARGET} PRIVATE ../..) target_include_directories(${TARGET} PRIVATE ../../..) diff --git a/llm/ext_server/README.md b/llm/ext_server/README.md new file mode 100644 index 00000000..ac58d9c8 --- /dev/null +++ b/llm/ext_server/README.md @@ -0,0 +1,4 @@ +# Extern C Server + +This directory contains a thin facade we layer on top of the Llama.cpp server +to expose `extern C` interfaces to access the functionality through direct API calls in-process diff --git a/llm/llama.cpp/ext_server.cpp b/llm/ext_server/ext_server.cpp similarity index 100% rename from llm/llama.cpp/ext_server.cpp rename to llm/ext_server/ext_server.cpp diff --git a/llm/llama.cpp/ext_server.h b/llm/ext_server/ext_server.h similarity index 100% rename from llm/llama.cpp/ext_server.h rename to llm/ext_server/ext_server.h diff --git a/llm/ext_server_common.go b/llm/ext_server_common.go index 470df412..171bfca1 100644 --- a/llm/ext_server_common.go +++ b/llm/ext_server_common.go @@ -1,7 +1,7 @@ package llm /* -#cgo CFLAGS: -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/gguf -I${SRCDIR}/llama.cpp/gguf/common -I${SRCDIR}/llama.cpp/gguf/examples/server +#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server #cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 #cgo CFLAGS: -Wmissing-noreturn -Wall -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds #cgo CPPFLAGS: -Ofast -Wall -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations -Wno-unused-but-set-variable @@ -10,17 +10,17 @@ package llm #cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG #cgo darwin LDFLAGS: -lc++ -framework Accelerate #cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders -#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libcommon.a -#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libext_server.a -#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libllama.a -#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/darwin/metal/lib/libggml_static.a +#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libcommon.a +#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libext_server.a +#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a +#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a #cgo linux CFLAGS: -D_GNU_SOURCE #cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS #cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libext_server.a -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libcommon.a -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libllama.a -#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/linux/cpu/lib/libggml_static.a +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libext_server.a +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libcommon.a +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libllama.a +#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libggml_static.a #cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm #cgo linux windows LDFLAGS: -lpthread diff --git a/llm/llama.cpp/gen_common.sh b/llm/generate/gen_common.sh similarity index 70% rename from llm/llama.cpp/gen_common.sh rename to llm/generate/gen_common.sh index 4346dd7a..9574e6fd 100644 --- a/llm/llama.cpp/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -1,7 +1,7 @@ # common logic accross linux and darwin init_vars() { - LLAMACPP_DIR=gguf + LLAMACPP_DIR=../llama.cpp PATCHES="0001-Expose-callable-API-for-server.patch" CMAKE_DEFS="" CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static" @@ -19,18 +19,18 @@ git_module_setup() { return fi git submodule init - git submodule update --force gguf + git submodule update --force ${LLAMACPP_DIR} } apply_patches() { # Wire up our CMakefile - if ! grep ollama gguf/examples/server/CMakeLists.txt; then - echo 'include (../../../CMakeLists.txt) # ollama' >>gguf/examples/server/CMakeLists.txt + if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then + echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt fi # Avoid duplicate main symbols when we link into the cgo binary - sed -e 's/int main(/int __main(/g' <./gguf/examples/server/server.cpp >./gguf/examples/server/server.cpp.tmp && - mv ./gguf/examples/server/server.cpp.tmp ./gguf/examples/server/server.cpp + sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp && + mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp } build() { @@ -49,5 +49,5 @@ install() { # Keep the local tree clean after we're done with the build cleanup() { - (cd gguf/examples/server/ && git checkout CMakeLists.txt server.cpp) + (cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp) } diff --git a/llm/llama.cpp/gen_darwin.sh b/llm/generate/gen_darwin.sh similarity index 90% rename from llm/llama.cpp/gen_darwin.sh rename to llm/generate/gen_darwin.sh index 2924946a..97983a3f 100755 --- a/llm/llama.cpp/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -1,6 +1,6 @@ #!/bin/bash # This script is intended to run inside the go generate -# working directory must be ../llm/llama.cpp +# working directory must be ./llm/generate/ # TODO - add hardening to detect missing tools (cmake, etc.) @@ -10,7 +10,7 @@ echo "Starting darwin generate script" source $(dirname $0)/gen_common.sh init_vars CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on ${CMAKE_DEFS}" -BUILD_DIR="gguf/build/darwin/metal" +BUILD_DIR="${LLAMACPP_DIR}/build/darwin/metal" case "${GOARCH}" in "amd64") CMAKE_DEFS="-DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}" diff --git a/llm/llama.cpp/gen_linux.sh b/llm/generate/gen_linux.sh similarity index 95% rename from llm/llama.cpp/gen_linux.sh rename to llm/generate/gen_linux.sh index c3ca280b..52081156 100755 --- a/llm/llama.cpp/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -1,6 +1,6 @@ #!/bin/bash # This script is intended to run inside the go generate -# working directory must be llm/llama.cpp +# working directory must be llm/generate/ # First we build our default built-in library which will be linked into the CGO # binary as a normal dependency. This default build is CPU based. @@ -52,7 +52,7 @@ apply_patches # CPU first for the default library # CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" -BUILD_DIR="gguf/build/linux/cpu" +BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu" build install @@ -64,7 +64,7 @@ if [ -d /usr/local/cuda/lib64/ ]; then echo "CUDA libraries detected - building dynamic CUDA library" init_vars CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" - BUILD_DIR="gguf/build/linux/cuda" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda" CUDA_LIB_DIR=/usr/local/cuda/lib64 build install @@ -98,7 +98,7 @@ if [ -d "${ROCM_PATH}" ]; then echo "ROCm libraries detected - building dynamic ROCm library" init_vars CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" - BUILD_DIR="gguf/build/linux/rocm" + BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm" build install gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \ diff --git a/llm/llama.cpp/gen_windows.ps1 b/llm/generate/gen_windows.ps1 similarity index 72% rename from llm/llama.cpp/gen_windows.ps1 rename to llm/generate/gen_windows.ps1 index e8e5835d..8675ae43 100644 --- a/llm/llama.cpp/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -3,6 +3,7 @@ $ErrorActionPreference = "Stop" function init_vars { + $script:llamacppDir = "../llama.cpp" $script:patches = @("0001-Expose-callable-API-for-server.patch") $script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64") $script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static") @@ -19,25 +20,25 @@ function git_module_setup { # TODO add flags to skip the init/patch logic to make it easier to mod llama.cpp code in-repo & git submodule init if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} - & git submodule update --force gguf + & git submodule update --force "${script:llamacppDir}" if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } function apply_patches { # Wire up our CMakefile - if (!(Select-String -Path "gguf/examples/server/CMakeLists.txt" -Pattern 'ollama')) { - Add-Content -Path "gguf/examples/server/CMakeLists.txt" -Value 'include (../../../CMakeLists.txt) # ollama' + if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) { + Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama' } # Avoid duplicate main symbols when we link into the cgo binary - $content = Get-Content -Path "./gguf/examples/server/server.cpp" + $content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp" $content = $content -replace 'int main\(', 'int __main(' - Set-Content -Path "./gguf/examples/server/server.cpp" -Value $content + Set-Content -Path "${script:llamacppDir}/examples/server/server.cpp" -Value $content } function build { - write-host "generating config with: cmake -S gguf -B $script:buildDir $script:cmakeDefs" + write-host "generating config with: cmake -S ${script:llamacppDir} -B $script:buildDir $script:cmakeDefs" & cmake --version - & cmake -S gguf -B $script:buildDir $script:cmakeDefs + & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })" & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) @@ -55,7 +56,7 @@ function install { } function cleanup { - Set-Location "gguf/examples/server" + Set-Location "${script:llamacppDir}/examples/server" git checkout CMakeLists.txt server.cpp } @@ -64,20 +65,20 @@ git_module_setup apply_patches # first build CPU based -$script:buildDir="gguf/build/windows/cpu" +$script:buildDir="${script:llamacppDir}/build/windows/cpu" build install # Then build cuda as a dynamically loaded library init_vars -$script:buildDir="gguf/build/windows/cuda" +$script:buildDir="${script:llamacppDir}/build/windows/cuda" $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON") build install # TODO - actually implement ROCm support on windows -$script:buildDir="gguf/build/windows/rocm" +$script:buildDir="${script:llamacppDir}/build/windows/rocm" rm -ea 0 -recurse -force -path "${script:buildDir}/lib" md "${script:buildDir}/lib" -ea 0 > $null diff --git a/llm/llama.cpp/generate_darwin.go b/llm/generate/generate_darwin.go similarity index 66% rename from llm/llama.cpp/generate_darwin.go rename to llm/generate/generate_darwin.go index 498e5005..322879e9 100644 --- a/llm/llama.cpp/generate_darwin.go +++ b/llm/generate/generate_darwin.go @@ -1,3 +1,3 @@ -package llm +package generate //go:generate sh ./gen_darwin.sh diff --git a/llm/llama.cpp/generate_linux.go b/llm/generate/generate_linux.go similarity index 67% rename from llm/llama.cpp/generate_linux.go rename to llm/generate/generate_linux.go index 119b5c27..2b7e116d 100644 --- a/llm/llama.cpp/generate_linux.go +++ b/llm/generate/generate_linux.go @@ -1,3 +1,3 @@ -package llm +package generate //go:generate bash ./gen_linux.sh diff --git a/llm/llama.cpp/generate_windows.go b/llm/generate/generate_windows.go similarity index 81% rename from llm/llama.cpp/generate_windows.go rename to llm/generate/generate_windows.go index 87acd827..d2ee5428 100644 --- a/llm/llama.cpp/generate_windows.go +++ b/llm/generate/generate_windows.go @@ -1,3 +1,3 @@ -package llm +package generate //go:generate powershell -ExecutionPolicy Bypass -File ./gen_windows.ps1 diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go index f7427b96..6e416b6d 100644 --- a/llm/shim_darwin.go +++ b/llm/shim_darwin.go @@ -13,7 +13,7 @@ import ( "github.com/jmorganca/ollama/api" ) -//go:embed llama.cpp/gguf/ggml-metal.metal +//go:embed llama.cpp/ggml-metal.metal var libEmbed embed.FS func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { @@ -22,7 +22,7 @@ func newDynamicShimExtServer(library, model string, adapters, projectors []strin } func nativeInit(workdir string) error { - err := extractPayloadFiles(workdir, "llama.cpp/gguf/ggml-metal.metal") + err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal") if err != nil { if err == payloadMissing { // TODO perhaps consider this a hard failure on arm macs? diff --git a/llm/shim_ext_server.go b/llm/shim_ext_server.go index 9538796d..0282d5f7 100644 --- a/llm/shim_ext_server.go +++ b/llm/shim_ext_server.go @@ -34,6 +34,8 @@ type shimExtServer struct { var shimMutex sync.Mutex var llm *shimExtServer +const pathComponentCount = 6 + func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) { C.dynamic_shim_llama_server_init(llm.s, sparams, err) } @@ -112,7 +114,7 @@ func (llm *shimExtServer) Close() { } func nativeInit(workdir string) error { - libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/*/*/lib/*") + libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*") if err != nil { if err == payloadMissing { log.Printf("%s", payloadMissing) @@ -151,13 +153,13 @@ func extractDynamicLibs(workDir, glob string) ([]string, error) { for _, file := range files { pathComps := strings.Split(file, "/") - if len(pathComps) != 7 { + if len(pathComps) != pathComponentCount { log.Printf("unexpected payload components: %v", pathComps) continue } - // llama.cpp/gguf/build/$OS/$VARIANT/lib/$LIBRARY + // llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY // Include the variant in the path to avoid conflicts between multiple server libs - targetDir := filepath.Join(workDir, pathComps[4]) + targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3]) srcFile, err := libEmbed.Open(file) if err != nil { return nil, fmt.Errorf("read payload %s: %v", file, err) diff --git a/llm/shim_ext_server_linux.go b/llm/shim_ext_server_linux.go index 83409fc8..e0ad5da4 100644 --- a/llm/shim_ext_server_linux.go +++ b/llm/shim_ext_server_linux.go @@ -10,7 +10,7 @@ import ( "strings" ) -//go:embed llama.cpp/gguf/build/*/*/lib/*.so +//go:embed llama.cpp/build/*/*/lib/*.so var libEmbed embed.FS func updatePath(dir string) { diff --git a/llm/shim_ext_server_windows.go b/llm/shim_ext_server_windows.go index b140c22c..e95c8afa 100644 --- a/llm/shim_ext_server_windows.go +++ b/llm/shim_ext_server_windows.go @@ -8,7 +8,7 @@ import ( "strings" ) -//go:embed llama.cpp/gguf/build/windows/*/lib/*.dll +//go:embed llama.cpp/build/windows/*/lib/*.dll var libEmbed embed.FS func updatePath(dir string) { diff --git a/llm/llama.cpp/gguf b/tmp similarity index 100% rename from llm/llama.cpp/gguf rename to tmp