diff --git a/Dockerfile b/Dockerfile
index 8eb90057..79b2a696 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,6 +3,9 @@ ARG CMAKE_VERSION=3.22.1
 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md
 ARG CUDA_VERSION=11.3.1
 ARG ROCM_VERSION=6.1.2
+ARG JETPACK_6=r36.2.0
+ARG JETPACK_5=r35.4.1
+ARG JETPACK_4=r32.7.1
 
 # Copy the minimal context we need to run the generate scripts
 FROM scratch AS llm-code
@@ -22,7 +25,7 @@ ENV GOARCH amd64
 RUN --mount=type=cache,target=/root/.ccache \
     OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
 
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -31,11 +34,40 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ENV GOARCH arm64
+RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
+
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64
+ARG CMAKE_VERSION
+RUN apt-get update && apt-get install -y git curl && \
+    curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
+COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+ARG CGO_CFLAGS
+ENV GOARCH arm64
+ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs
 RUN --mount=type=cache,target=/root/.ccache \
     OLLAMA_SKIP_STATIC_GENERATE=1 \
     OLLAMA_SKIP_CPU_GENERATE=1 \
-    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
-    CUDA_VARIANT="_v11" \
+    CUDA_VARIANT="_jetpack6" \
+    CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack6" \
+    CMAKE_CUDA_ARCHITECTURES="87" \
+    bash gen_linux.sh
+
+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS cuda-build-jetpack5-arm64
+ARG CMAKE_VERSION
+RUN apt-get update && apt-get install -y git curl && \
+    curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
+COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+ARG CGO_CFLAGS
+ENV GOARCH arm64
+ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CUDA_VARIANT="_jetpack5" \
+    CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack5" \
+    CMAKE_CUDA_ARCHITECTURES="72;87" \
     bash gen_linux.sh
 
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
@@ -123,8 +155,14 @@ ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+## arm binary += 381M
+COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+## arm binary += 330M
+COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
diff --git a/gpu/gpu.go b/gpu/gpu.go
index d0ae0f34..22461922 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -15,7 +15,9 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
+	"regexp"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"unsafe"
@@ -215,7 +217,7 @@ func GetGPUInfo() GpuInfoList {
 			GpuInfo: GpuInfo{
 				memInfo: mem,
 				Library: "cpu",
-				Variant: cpuCapability,
+				Variant: cpuCapability.String(),
 				ID:      "0",
 			},
 		},
@@ -231,6 +233,35 @@ func GetGPUInfo() GpuInfoList {
 
 		depPath := GetDepDir()
 
+		var cudaVariant string
+		if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
+			if CudaTegra != "" {
+				ver := strings.Split(CudaTegra, ".")
+				if len(ver) > 0 {
+					cudaVariant = "jetpack" + ver[0]
+				}
+			} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
+				r := regexp.MustCompile(` R(\d+) `)
+				m := r.FindSubmatch(data)
+				if len(m) != 2 {
+					slog.Info("Unexpected format for /etc/nv_tegra_release. Set JETSON_JETPACK to select version")
+				} else {
+					if l4t, err := strconv.Atoi(string(m[1])); err == nil {
+						// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
+						// https://developer.nvidia.com/embedded/jetpack-archive
+						switch l4t {
+						case 35:
+							cudaVariant = "jetpack5"
+						case 36:
+							cudaVariant = "jetpack6"
+						default:
+							slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
+						}
+					}
+				}
+			}
+		}
+
 		// Load ALL libraries
 		cHandles = initCudaHandles()
 
@@ -240,6 +271,7 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo := CudaGPUInfo{
 				GpuInfo: GpuInfo{
 					Library: "cuda",
+					Variant: cudaVariant,
 				},
 				index: i,
 			}
@@ -266,7 +298,15 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 			gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
 			gpuInfo.MinimumMemory = cudaMinimumMemory
-			gpuInfo.DependencyPath = depPath
+			if depPath != "" {
+				gpuInfo.DependencyPath = depPath
+				// Check for variant specific directory
+				if cudaVariant != "" {
+					if _, err := os.Stat(filepath.Join(depPath, "cuda_"+cudaVariant)); err == nil {
+						gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+cudaVariant)
+					}
+				}
+			}
 			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 			gpuInfo.DriverMajor = driverMajor
 			gpuInfo.DriverMinor = driverMinor
diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index 9d9fd84e..417b48df 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList {
 		return []GpuInfo{
 			{
 				Library: "cpu",
-				Variant: GetCPUCapability(),
+				Variant: GetCPUCapability().String(),
 				memInfo: mem,
 			},
 		}
@@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUCapability(),
+			Variant: GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}
diff --git a/gpu/types.go b/gpu/types.go
index 8d22b06b..fc628d47 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -19,7 +19,7 @@ type GpuInfo struct {
 	Library string `json:"library,omitempty"`
 
 	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant CPUCapability `json:"variant"`
+	Variant string `json:"variant"`
 
 	// MinimumMemory represents the minimum memory required to use the GPU
 	MinimumMemory uint64 `json:"-"`
@@ -81,8 +81,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != CPUCapabilityNone {
-			requested += "_" + info.Variant.String()
+		if info.Variant != CPUCapabilityNone.String() {
+			requested += "_" + info.Variant
 		}
 		for i, lib := range libs {
 			if lib == requested {
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 1365d07d..dc9dda5a 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -165,7 +165,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
     echo "CUDA libraries detected - building dynamic CUDA library"
     init_vars
     CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
-    if [ -n "${CUDA_MAJOR}" ]; then
+    if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
         CUDA_VARIANT=_v${CUDA_MAJOR}
     fi
     if [ "${ARCH}" == "arm64" ]; then
@@ -189,9 +189,10 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
     BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
     export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
-    CUDA_DIST_DIR="${DIST_BASE}/ollama_libs"
+    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/ollama_libs}"
     build
     install
+    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
     mkdir -p "${CUDA_DIST_DIR}"
     for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
         cp -a "${lib}" "${CUDA_DIST_DIR}"
diff --git a/llm/payload.go b/llm/payload.go
index b402e1f2..963b3295 100644
--- a/llm/payload.go
+++ b/llm/payload.go
@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := getAvailableServers()
 	requested := info.Library
-	if info.Variant != gpu.CPUCapabilityNone {
-		requested += "_" + info.Variant.String()
+	if info.Variant != gpu.CPUCapabilityNone.String() {
+		requested += "_" + info.Variant
 	}
 
 	servers := []string{}
diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh
index ebb60c5a..adda2ad7 100755
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -22,6 +22,7 @@ for TARGETARCH in ${BUILD_ARCH}; do
         -t builder:$TARGETARCH \
         .
     docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
+    rm -rf ./dist/linux-$TARGETARCH
    docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist
    docker rm builder-$TARGETARCH
    echo "Compressing final linux bundle..."
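
A rough smoke test for reviewers, not part of the patch: the dist paths below come straight from the CUDA_DIST_DIR values in the new Dockerfile stages, while the JETSON_JETPACK override is only implied by the gpu.go log message, so treat that invocation as an assumption.

    # Build the Linux bundles; the arm64 image now runs the extra JetPack stages.
    ./scripts/build_linux.sh

    # The arm64 dist is expected to carry variant-specific CUDA dependencies:
    #   dist/linux-arm64/ollama_libs/cuda_jetpack5/
    #   dist/linux-arm64/ollama_libs/cuda_jetpack6/

    # On a Jetson where /etc/nv_tegra_release is missing or unparseable,
    # the variant can be forced by hand (per the slog message in gpu.go):
    JETSON_JETPACK=5 ollama serve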