diff --git a/.dockerignore b/.dockerignore index 43f2e07d..fada7a9b 100644 --- a/.dockerignore +++ b/.dockerignore @@ -7,3 +7,5 @@ llm/llama.cpp .env .cache test_data +llm/build +llama/build diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9c1e3e13..257d136a 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -102,8 +102,8 @@ jobs: with: name: generate-windows-cpu path: | - llm/build/**/bin/* - llm/build/**/*.a + build/**/* + build/**/*.a dist/windows-amd64/** # ROCm generation step @@ -176,7 +176,7 @@ jobs: with: name: generate-windows-rocm path: | - llm/build/**/bin/* + build/**/* dist/windows-amd64/** - uses: actions/upload-artifact@v4 with: @@ -265,7 +265,7 @@ jobs: with: name: generate-windows-cuda-${{ matrix.cuda.version }} path: | - llm/build/**/bin/* + build/**/* dist/windows-amd64/** - uses: actions/upload-artifact@v4 with: @@ -338,7 +338,7 @@ jobs: - uses: actions/download-artifact@v4 with: name: generate-windows-rocm - - run: dir llm/build + - run: dir build - run: | $gopath=(get-command go).source | split-path -parent & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" @@ -359,9 +359,7 @@ jobs: environment: release runs-on: linux env: - OLLAMA_SKIP_MANIFEST_CREATE: '1' - BUILD_ARCH: amd64 - PUSH: '1' + PLATFORM: linux/amd64 steps: - uses: actions/checkout@v4 with: @@ -369,14 +367,8 @@ jobs: - name: Set Version shell: bash run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ vars.DOCKER_USER }} - password: ${{ secrets.DOCKER_ACCESS_TOKEN }} - run: | ./scripts/build_linux.sh - ./scripts/build_docker.sh - uses: actions/upload-artifact@v4 with: name: dist-linux-amd64 @@ -390,9 +382,7 @@ jobs: environment: release runs-on: linux-arm64 env: - OLLAMA_SKIP_MANIFEST_CREATE: '1' - BUILD_ARCH: arm64 - PUSH: '1' + PLATFORM: linux/arm64 steps: - uses: actions/checkout@v4 with: @@ -421,14 +411,8 @@ jobs: sudo usermod -aG docker $USER sudo apt-get install acl sudo setfacl --modify user:$USER:rw /var/run/docker.sock - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ vars.DOCKER_USER }} - password: ${{ secrets.DOCKER_ACCESS_TOKEN }} - run: | ./scripts/build_linux.sh - ./scripts/build_docker.sh - uses: actions/upload-artifact@v4 with: name: dist-linux-arm64 @@ -436,6 +420,181 @@ jobs: dist/*linux* !dist/*-cov + # Container image build + build-linux: + environment: release + strategy: + matrix: + runner: + - linux + - linux-arm64 + runs-on: ${{ matrix.runner }} + env: + FINAL_IMAGE_REPO: ollama/ollama + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - name: 'Install Docker' + if: ${{ startsWith(matrix.runner, 'linux-arm64') }} + run: | + sudo apt-get update + sudo apt-get install -y ca-certificates curl + sudo install -m 0755 -d /etc/apt/keyrings + sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc + sudo chmod a+r /etc/apt/keyrings/docker.asc + echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. 
/etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null + sudo apt-get update + sudo apt-get install -y docker-ce docker-ce-cli containerd.io + sudo usermod -aG docker $USER + sudo apt-get install acl + sudo setfacl --modify user:$USER:rw /var/run/docker.sock + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.FINAL_IMAGE_REPO }} + flavor: | + latest=false + tags: | + type=ref,event=tag + type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr + type=semver,pattern={{version}} + - name: Set Version + shell: bash + run: | + machine=$(uname -m) + case ${machine} in + x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;; + aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;; + esac >>$GITHUB_ENV + echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ vars.DOCKER_USER }} + password: ${{ secrets.DOCKER_ACCESS_TOKEN }} + - name: Build and push by digest + id: build + uses: docker/build-push-action@v6 + with: + context: "." + platforms: linux/${{ env.ARCH }} + build-args: | + GOFLAGS + outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true + - name: Export digest + run: | + mkdir -p /tmp/digests + digest="${{ steps.build.outputs.digest }}" + touch "/tmp/digests/${digest#sha256:}" + - name: Upload digest + uses: actions/upload-artifact@v4 + with: + name: digests-${{ env.PLATFORM_PAIR }} + path: /tmp/digests/* + if-no-files-found: error + retention-days: 1 + merge: + environment: release + runs-on: linux + needs: + - build-linux + env: + FINAL_IMAGE_REPO: ollama/ollama + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - name: Download digests + uses: actions/download-artifact@v4 + with: + path: /tmp/digests + pattern: digests-* + merge-multiple: true + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.FINAL_IMAGE_REPO }} + flavor: | + latest=false + tags: | + type=ref,event=tag + type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr + type=semver,pattern={{version}} + - name: Set Version + shell: bash + run: | + machine=$(uname -m) + case ${machine} in + x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;; + aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;; + esac >>$GITHUB_ENV + echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ vars.DOCKER_USER }} + password: ${{ secrets.DOCKER_ACCESS_TOKEN }} + - name: Create manifest list and push + working-directory: /tmp/digests + run: | + docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) 
| join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ + $(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *) + - name: Inspect image + run: | + docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }} + build-linux-rocm: + environment: release + runs-on: linux + env: + FINAL_IMAGE_REPO: ollama/ollama + ARCH: amd64 + PLATFORM_PAIR: linux-amd64 + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.FINAL_IMAGE_REPO }} + flavor: | + latest=false + tags: | + type=ref,event=tag + type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr + type=semver,pattern={{version}} + - name: Set Version + shell: bash + run: | + echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ vars.DOCKER_USER }} + password: ${{ secrets.DOCKER_ACCESS_TOKEN }} + - name: Build and push by digest + id: build + uses: docker/build-push-action@v6 + with: + context: "." + target: runtime-rocm + build-args: | + GOFLAGS + tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm,${{ env.FINAL_IMAGE_REPO }}:rocm + push: true + # Aggregate all the assets and ship a release release: needs: @@ -448,8 +607,6 @@ jobs: permissions: contents: write env: - OLLAMA_SKIP_IMAGE_BUILD: '1' - PUSH: '1' GH_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@v4 @@ -458,12 +615,6 @@ jobs: run: | echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ vars.DOCKER_USER }} - password: ${{ secrets.DOCKER_ACCESS_TOKEN }} - - run: ./scripts/build_docker.sh - name: Retrieve built artifact uses: actions/download-artifact@v4 with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3d58fa3e..26dc732a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -81,12 +81,6 @@ jobs: if: ${{ ! startsWith(matrix.os, 'windows-') }} name: 'Unix Go Generate' - run: go build . - - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.os }}-${{ matrix.arch }}-libraries - path: | - llm/build/**/bin/* - llm/build/**/*.a generate-cuda: needs: [changes] if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }} @@ -114,12 +108,6 @@ jobs: go generate -x ./... env: OLLAMA_SKIP_CPU_GENERATE: '1' - - uses: actions/upload-artifact@v4 - with: - name: cuda-${{ matrix.cuda-version }}-libraries - path: | - llm/build/**/bin/* - dist/windows-amd64/** generate-rocm: needs: [changes] if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }} @@ -147,12 +135,6 @@ jobs: go generate -x ./... env: OLLAMA_SKIP_CPU_GENERATE: '1' - - uses: actions/upload-artifact@v4 - with: - name: rocm-${{ matrix.rocm-version }}-libraries - path: | - llm/build/**/bin/* - dist/windows-amd64/** # ROCm generation step generate-windows-rocm: @@ -189,7 +171,6 @@ jobs: name: go generate env: OLLAMA_SKIP_CPU_GENERATE: '1' - # TODO - do we need any artifacts? # CUDA generation step generate-windows-cuda: @@ -231,7 +212,6 @@ jobs: go generate -x ./... env: OLLAMA_SKIP_CPU_GENERATE: '1' - # TODO - do we need any artifacts? 
lint: strategy: @@ -263,14 +243,6 @@ jobs: arm64) echo ARCH=arm64 ;; esac >>$GITHUB_ENV shell: bash - - run: | - mkdir -p llm/build/linux/$ARCH/stub/bin - touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server - if: ${{ startsWith(matrix.os, 'ubuntu-') }} - - run: | - mkdir -p llm/build/darwin/$ARCH/stub/bin - touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server - if: ${{ startsWith(matrix.os, 'macos-') }} - uses: golangci/golangci-lint-action@v6 with: args: --timeout 8m0s -v @@ -301,23 +273,10 @@ jobs: cache: true - run: | case ${{ matrix.arch }} in - amd64) echo ARCH=x86_64 ;; + amd64) echo ARCH=amd64 ;; arm64) echo ARCH=arm64 ;; esac >>$GITHUB_ENV shell: bash - - run: | - mkdir -p llm/build/linux/$ARCH/stub/bin - touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server - if: ${{ startsWith(matrix.os, 'ubuntu-') }} - - run: | - mkdir -p llm/build/darwin/$ARCH/stub/bin - touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server - if: ${{ startsWith(matrix.os, 'macos-') }} - shell: bash - run: go generate ./... - run: go build - run: go test -v ./... - - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.os }}-binaries - path: ollama diff --git a/.gitignore b/.gitignore index 0d826ab6..87f8b007 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,7 @@ ggml-metal.metal test_data *.crt llm/build +build/*/*/* +!build/**/placeholder +llama/build __debug_bin* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index e0f94b57..8b849043 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,12 +16,12 @@ FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-1 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH +ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG CUDA_V11_ARCHITECTURES -ENV GOARCH amd64 +ENV GOARCH=amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ @@ -33,12 +33,12 @@ FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-1 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH +ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG CUDA_V12_ARCHITECTURES -ENV GOARCH amd64 +ENV GOARCH=amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ @@ -47,32 +47,32 @@ RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ bash gen_linux.sh -#FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64 +#FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64 #ARG CMAKE_VERSION #COPY ./scripts/rh_linux_deps.sh / #RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh -#ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH +#ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH #COPY --from=llm-code / /go/src/github.com/ollama/ollama/ #WORKDIR /go/src/github.com/ollama/ollama/llm/generate #ARG CGO_CFLAGS #ARG CUDA_V11_ARCHITECTURES -#ENV GOARCH arm64 +#ENV GOARCH=arm64 #RUN OLLAMA_SKIP_STATIC_GENERATE=1 \ # 
OLLAMA_SKIP_CPU_GENERATE=1 \ # CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ # CUDA_VARIANT="_v11" \ # bash gen_linux.sh -#FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64 +#FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64 #ARG CMAKE_VERSION #COPY ./scripts/rh_linux_deps.sh / #RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh -#ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH +#ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH #COPY --from=llm-code / /go/src/github.com/ollama/ollama/ #WORKDIR /go/src/github.com/ollama/ollama/llm/generate #ARG CGO_CFLAGS #ARG CUDA_V12_ARCHITECTURES -#ENV GOARCH arm64 +#ENV GOARCH=arm64 #RUN --mount=type=cache,target=/root/.ccache \ # OLLAMA_SKIP_STATIC_GENERATE=1 \ # OLLAMA_SKIP_CPU_GENERATE=1 \ @@ -86,13 +86,13 @@ FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-b ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH -ENV LIBRARY_PATH /opt/amdgpu/lib64 +ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH +ENV LIBRARY_PATH=/opt/amdgpu/lib64 COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG AMDGPU_TARGETS -ENV GOARCH amd64 +ENV GOARCH=amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \ @@ -103,11 +103,11 @@ ARG CMAKE_VERSION ARG GOLANG_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH +ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS -ENV GOARCH amd64 +ENV GOARCH=amd64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64 @@ -128,11 +128,11 @@ RUN --mount=type=cache,target=/root/.ccache \ #ARG GOLANG_VERSION #COPY ./scripts/rh_linux_deps.sh / #RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh -#ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH +#ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH #COPY --from=llm-code / /go/src/github.com/ollama/ollama/ #ARG OLLAMA_CUSTOM_CPU_DEFS #ARG CGO_CFLAGS -#ENV GOARCH arm64 +#ENV GOARCH=arm64 #WORKDIR /go/src/github.com/ollama/ollama/llm/generate #FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64 @@ -143,73 +143,112 @@ RUN --mount=type=cache,target=/root/.ccache \ # OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh -# Intermediate stage used for ./scripts/build_linux.sh +# Intermediate stages used for ./scripts/build_linux.sh FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64 -ENV CGO_ENABLED 1 +ENV CGO_ENABLED=1 WORKDIR /go/src/github.com/ollama/ollama COPY . . 
-COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/ +COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/build/ build/ +COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/build/ build/ COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/build/ build/ COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/build/ build/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/build/ build/ +ARG GOFLAGS +ARG CGO_CFLAGS +RUN --mount=type=cache,target=/root/.ccache \ + go build -trimpath -o dist/linux-amd64/bin/ollama . +RUN cd dist/linux-$GOARCH && \ + tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz +RUN cd dist/linux-$GOARCH-rocm && \ + tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz + +#FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 +#ENV CGO_ENABLED=1 +#ARG GOLANG_VERSION +#WORKDIR /go/src/github.com/ollama/ollama +#COPY . . +#COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/ +#COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +#COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/ +#COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +#COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/ +#ARG GOFLAGS +#ARG CGO_CFLAGS +#RUN --mount=type=cache,target=/root/.ccache \ +# go build -trimpath -o dist/linux-arm64/bin/ollama . +#RUN cd dist/linux-$GOARCH && \ +# tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz + +#FROM --platform=linux/amd64 scratch AS dist-amd64 +#COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz / +#FROM --platform=linux/arm64 scratch AS dist-arm64 +#COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz / +#FROM dist-$TARGETARCH as dist + + +# Optimized container images do not carry nested payloads +FROM --platform=linux/amd64 static-build-amd64 AS container-build-amd64 +WORKDIR /go/src/github.com/ollama/ollama +COPY . . ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ go build -trimpath -o dist/linux-amd64/bin/ollama . -# Intermediate stage used for ./scripts/build_linux.sh -#FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 -#ENV CGO_ENABLED 1 -#ARG GOLANG_VERSION -#WORKDIR /go/src/github.com/ollama/ollama -#COPY . . 
-#COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -#COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -#COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -#COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -#COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -#ARG GOFLAGS -#ARG CGO_CFLAGS -#RUN --mount=type=cache,target=/root/.ccache \ -# go build -trimpath -o dist/linux-arm64/bin/ollama . +FROM --platform=linux/arm64 static-build-arm64 AS container-build-arm64 +WORKDIR /go/src/github.com/ollama/ollama +COPY . . +ARG GOFLAGS +ARG CGO_CFLAGS +RUN --mount=type=cache,target=/root/.ccache \ + go build -trimpath -o dist/linux-arm64/bin/ollama . -# Strip out ROCm dependencies to keep the primary image lean -FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/ -RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa* - -# Runtime stages -FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 -COPY --from=amd64-libs-without-rocm /scratch/ /lib/ -RUN apt-get update && apt-get install -y ca-certificates && \ +FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64 +RUN apt-get update && \ + apt-get install -y ca-certificates && \ apt-get clean && rm -rf /var/lib/apt/lists/* -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ +COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ +COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ -#FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64 -#COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ -#RUN apt-get update && apt-get install -y ca-certificates && \ +#FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64 +#RUN apt-get update && \ +# apt-get install -y ca-certificates && \ # apt-get clean && rm -rf /var/lib/apt/lists/* -#COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ +#COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ +#COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ +#COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ +#COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ -# Radeon images are much larger so we keep it distinct from the CPU/CUDA image -FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm -RUN update-pciids -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ -RUN ln -s /opt/rocm/lib /lib/ollama +# ROCm libraries are larger, so we keep this image distinct from the CPU/CUDA image +FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm +# Frontload the ROCm libraries, which are large and 
rarely change to increase chance of a common layer +# across releases +COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/ +RUN apt-get update && \ + apt-get install -y ca-certificates && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ +COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ EXPOSE 11434 -ENV OLLAMA_HOST 0.0.0.0 +ENV OLLAMA_HOST=0.0.0.0 ENTRYPOINT ["/bin/ollama"] CMD ["serve"] FROM runtime-$TARGETARCH EXPOSE 11434 -ENV OLLAMA_HOST 0.0.0.0 +ENV OLLAMA_HOST=0.0.0.0 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility diff --git a/README.md b/README.md index 181694ef..ff559372 100644 --- a/README.md +++ b/README.md @@ -498,6 +498,8 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Claude Dev](https://github.com/saoudrizwan/claude-dev) - VSCode extension for multi-file/whole-repo coding - [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support) - [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption) +- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library) +- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama) ### Terminal @@ -522,6 +524,8 @@ See the [API documentation](./docs/api.md) for all endpoints. - [podman-ollama](https://github.com/ericcurtin/podman-ollama) - [gollama](https://github.com/sammcj/gollama) - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/) +- [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe) +- [vim-intelligence-bridge](https://github.com/pepo-ec/vim-intelligence-bridge) Simple interaction of "Ollama" with the Vim editor ### Apple Vision Pro - [Enchanted](https://github.com/AugustDev/enchanted) @@ -544,6 +548,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa) - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama) +- [crewAI](https://github.com/crewAIInc/crewAI) - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example) - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java) - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs) @@ -575,6 +580,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient) - [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun) - [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php) +- [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama) ### Mobile @@ -605,6 +611,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama) - [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face) - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension) +- [Plasmoid Ollama Control](https://github.com/imoize/plasmoid-ollamacontrol) (KDE Plasma extension that allows you to quickly manage/control Ollama model) - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend) - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support) - [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation) @@ -612,6 +619,8 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server) - [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links) - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality) +- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator) +- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator) ### Supported backends diff --git a/build/darwin/amd64/placeholder b/build/darwin/amd64/placeholder new file mode 100644 index 00000000..87dc2738 --- /dev/null +++ b/build/darwin/amd64/placeholder @@ -0,0 +1 @@ +This is here to make sure the build/ directory exists for the go:embed command diff --git a/build/darwin/arm64/placeholder b/build/darwin/arm64/placeholder new file mode 100644 index 00000000..87dc2738 --- /dev/null +++ b/build/darwin/arm64/placeholder @@ -0,0 +1 @@ +This is here to make sure the build/ directory exists for the go:embed command diff --git a/build/embed_darwin_amd64.go b/build/embed_darwin_amd64.go new file mode 100644 index 00000000..af1458ea --- /dev/null +++ b/build/embed_darwin_amd64.go @@ -0,0 +1,8 @@ +package build + +import "embed" + +// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling + +//go:embed darwin/amd64/* +var EmbedFS embed.FS diff --git a/build/embed_darwin_arm64.go b/build/embed_darwin_arm64.go new file mode 100644 index 00000000..d885365d --- /dev/null +++ b/build/embed_darwin_arm64.go @@ -0,0 +1,8 @@ +package build + +import "embed" + +// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling + +//go:embed darwin/arm64/* +var EmbedFS embed.FS diff --git a/build/embed_linux.go b/build/embed_linux.go new file mode 100644 index 00000000..4cf7be4c --- /dev/null +++ b/build/embed_linux.go @@ -0,0 +1,6 @@ +package build + +import "embed" + +//go:embed linux/* +var EmbedFS embed.FS diff --git 
a/build/embed_unused.go b/build/embed_unused.go new file mode 100644 index 00000000..00fbe02e --- /dev/null +++ b/build/embed_unused.go @@ -0,0 +1,8 @@ +//go:build !linux && !darwin + +package build + +import "embed" + +// unused on windows +var EmbedFS embed.FS diff --git a/build/linux/amd64/placeholder b/build/linux/amd64/placeholder new file mode 100644 index 00000000..87dc2738 --- /dev/null +++ b/build/linux/amd64/placeholder @@ -0,0 +1 @@ +This is here to make sure the build/ directory exists for the go:embed command diff --git a/build/linux/arm64/placeholder b/build/linux/arm64/placeholder new file mode 100644 index 00000000..87dc2738 --- /dev/null +++ b/build/linux/arm64/placeholder @@ -0,0 +1 @@ +This is here to make sure the build/ directory exists for the go:embed command diff --git a/cmd/cmd.go b/cmd/cmd.go index 5de1ed1b..3bb8b06e 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -2,6 +2,7 @@ package cmd import ( "archive/zip" + "bufio" "bytes" "context" "crypto/ed25519" @@ -21,6 +22,7 @@ import ( "regexp" "runtime" "slices" + "strconv" "strings" "sync/atomic" "syscall" @@ -344,6 +346,39 @@ func (w *progressWriter) Write(p []byte) (n int, err error) { return len(p), nil } +func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error { + p := progress.NewProgress(os.Stderr) + defer p.StopAndClear() + + spinner := progress.NewSpinner("") + p.Add("", spinner) + + client, err := api.ClientFromEnvironment() + if err != nil { + return err + } + + req := &api.GenerateRequest{ + Model: opts.Model, + KeepAlive: opts.KeepAlive, + } + + return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil }) +} + +func StopHandler(cmd *cobra.Command, args []string) error { + opts := &runOptions{ + Model: args[0], + KeepAlive: &api.Duration{Duration: 0}, + } + if err := loadOrUnloadModel(cmd, opts); err != nil { + if strings.Contains(err.Error(), "not found") { + return fmt.Errorf("couldn't find model \"%s\" to stop", args[0]) + } + } + return nil +} + func RunHandler(cmd *cobra.Command, args []string) error { interactive := true @@ -422,7 +457,7 @@ func RunHandler(cmd *cobra.Command, args []string) error { opts.ParentModel = info.Details.ParentModel if interactive { - if err := loadModel(cmd, &opts); err != nil { + if err := loadOrUnloadModel(cmd, &opts); err != nil { return err } @@ -578,7 +613,7 @@ func ListHandler(cmd *cobra.Command, args []string) error { table.SetHeaderLine(false) table.SetBorder(false) table.SetNoWhiteSpace(true) - table.SetTablePadding("\t") + table.SetTablePadding(" ") table.AppendBulk(data) table.Render() @@ -613,7 +648,15 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error { cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100) procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent)) } - data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")}) + + var until string + delta := time.Since(m.ExpiresAt) + if delta > 0 { + until = "Stopping..." 
+ } else { + until = format.HumanTime(m.ExpiresAt, "Never") + } + data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until}) } } @@ -624,7 +667,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error { table.SetHeaderLine(false) table.SetBorder(false) table.SetNoWhiteSpace(true) - table.SetTablePadding("\t") + table.SetTablePadding(" ") table.AppendBulk(data) table.Render() @@ -720,125 +763,89 @@ func ShowHandler(cmd *cobra.Command, args []string) error { return nil } - showInfo(resp) - - return nil + return showInfo(resp, os.Stdout) } -func showInfo(resp *api.ShowResponse) { - modelData := [][]string{ - {"parameters", resp.Details.ParameterSize}, - {"quantization", resp.Details.QuantizationLevel}, - } - if resp.ModelInfo != nil { - arch := resp.ModelInfo["general.architecture"].(string) - modelData = append(modelData, - []string{"arch", arch}, - []string{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))}, - []string{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))}, - ) +func showInfo(resp *api.ShowResponse, w io.Writer) error { + tableRender := func(header string, rows func() [][]string) { + fmt.Fprintln(w, " ", header) + table := tablewriter.NewWriter(w) + table.SetAlignment(tablewriter.ALIGN_LEFT) + table.SetBorder(false) + table.SetNoWhiteSpace(true) + table.SetTablePadding(" ") + + switch header { + case "Template", "System", "License": + table.SetColWidth(100) + } + + table.AppendBulk(rows()) + table.Render() + fmt.Fprintln(w) } - mainTableData := [][]string{ - {"Model"}, - {renderSubTable(modelData, false)}, - } + tableRender("Model", func() (rows [][]string) { + if resp.ModelInfo != nil { + arch := resp.ModelInfo["general.architecture"].(string) + rows = append(rows, []string{"", "architecture", arch}) + rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ModelInfo["general.parameter_count"].(float64)))}) + rows = append(rows, []string{"", "context length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64), 'f', -1, 64)}) + rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64), 'f', -1, 64)}) + } else { + rows = append(rows, []string{"", "architecture", resp.Details.Family}) + rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize}) + } + rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel}) + return + }) if resp.ProjectorInfo != nil { - projectorData := [][]string{ - {"arch", "clip"}, - {"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))}, - } - - if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok { - projectorData = append(projectorData, []string{"projector type", projectorType.(string)}) - } - - projectorData = append(projectorData, - []string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))}, - []string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))}, - ) - - mainTableData = append(mainTableData, - []string{"Projector"}, - []string{renderSubTable(projectorData, false)}, - ) + tableRender("Projector", func() (rows [][]string) { + arch := resp.ProjectorInfo["general.architecture"].(string) + rows = append(rows, []string{"", "architecture", arch}) + rows = 
append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))}) + rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.embedding_length", arch)].(float64), 'f', -1, 64)}) + rows = append(rows, []string{"", "dimensions", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.projection_dim", arch)].(float64), 'f', -1, 64)}) + return + }) } if resp.Parameters != "" { - mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)}) + tableRender("Parameters", func() (rows [][]string) { + scanner := bufio.NewScanner(strings.NewReader(resp.Parameters)) + for scanner.Scan() { + if text := scanner.Text(); text != "" { + rows = append(rows, append([]string{""}, strings.Fields(text)...)) + } + } + return + }) + } + + head := func(s string, n int) (rows [][]string) { + scanner := bufio.NewScanner(strings.NewReader(s)) + for scanner.Scan() && (len(rows) < n || n < 0) { + if text := scanner.Text(); text != "" { + rows = append(rows, []string{"", strings.TrimSpace(text)}) + } + } + return } if resp.System != "" { - mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)}) + tableRender("System", func() [][]string { + return head(resp.System, 2) + }) } if resp.License != "" { - mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)}) + tableRender("License", func() [][]string { + return head(resp.License, 2) + }) } - table := tablewriter.NewWriter(os.Stdout) - table.SetAutoWrapText(false) - table.SetBorder(false) - table.SetAlignment(tablewriter.ALIGN_LEFT) - - for _, v := range mainTableData { - table.Append(v) - } - - table.Render() -} - -func renderSubTable(data [][]string, file bool) string { - var buf bytes.Buffer - table := tablewriter.NewWriter(&buf) - table.SetAutoWrapText(!file) - table.SetBorder(false) - table.SetNoWhiteSpace(true) - table.SetTablePadding("\t") - table.SetAlignment(tablewriter.ALIGN_LEFT) - - for _, v := range data { - table.Append(v) - } - - table.Render() - - renderedTable := buf.String() - lines := strings.Split(renderedTable, "\n") - for i, line := range lines { - lines[i] = "\t" + line - } - - return strings.Join(lines, "\n") -} - -func twoLines(s string) [][]string { - lines := strings.Split(s, "\n") - res := [][]string{} - - count := 0 - for _, line := range lines { - line = strings.TrimSpace(line) - if line != "" { - count++ - res = append(res, []string{line}) - if count == 2 { - return res - } - } - } - return res -} - -func formatParams(s string) string { - lines := strings.Split(s, "\n") - table := [][]string{} - - for _, line := range lines { - table = append(table, strings.Fields(line)) - } - return renderSubTable(table, false) + return nil } func CopyHandler(cmd *cobra.Command, args []string) error { @@ -1328,6 +1335,15 @@ func NewCLI() *cobra.Command { runCmd.Flags().Bool("insecure", false, "Use an insecure registry") runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically") runCmd.Flags().String("format", "", "Response format (e.g. 
json)") + + stopCmd := &cobra.Command{ + Use: "stop MODEL", + Short: "Stop a running model", + Args: cobra.ExactArgs(1), + PreRunE: checkServerHeartbeat, + RunE: StopHandler, + } + serveCmd := &cobra.Command{ Use: "serve", Aliases: []string{"start"}, @@ -1395,6 +1411,7 @@ func NewCLI() *cobra.Command { createCmd, showCmd, runCmd, + stopCmd, pullCmd, pushCmd, listCmd, @@ -1434,6 +1451,7 @@ func NewCLI() *cobra.Command { createCmd, showCmd, runCmd, + stopCmd, pullCmd, pushCmd, listCmd, diff --git a/cmd/cmd_test.go b/cmd/cmd_test.go new file mode 100644 index 00000000..0f8863cc --- /dev/null +++ b/cmd/cmd_test.go @@ -0,0 +1,206 @@ +package cmd + +import ( + "bytes" + "os" + "path/filepath" + "testing" + + "github.com/google/go-cmp/cmp" + + "github.com/ollama/ollama/api" +) + +func TestShowInfo(t *testing.T) { + t.Run("bare details", func(t *testing.T) { + var b bytes.Buffer + if err := showInfo(&api.ShowResponse{ + Details: api.ModelDetails{ + Family: "test", + ParameterSize: "7B", + QuantizationLevel: "FP16", + }, + }, &b); err != nil { + t.Fatal(err) + } + + expect := ` Model + architecture test + parameters 7B + quantization FP16 + +` + + if diff := cmp.Diff(expect, b.String()); diff != "" { + t.Errorf("unexpected output (-want +got):\n%s", diff) + } + }) + + t.Run("bare model info", func(t *testing.T) { + var b bytes.Buffer + if err := showInfo(&api.ShowResponse{ + ModelInfo: map[string]any{ + "general.architecture": "test", + "general.parameter_count": float64(7_000_000_000), + "test.context_length": float64(0), + "test.embedding_length": float64(0), + }, + Details: api.ModelDetails{ + Family: "test", + ParameterSize: "7B", + QuantizationLevel: "FP16", + }, + }, &b); err != nil { + t.Fatal(err) + } + + expect := ` Model + architecture test + parameters 7B + context length 0 + embedding length 0 + quantization FP16 + +` + if diff := cmp.Diff(expect, b.String()); diff != "" { + t.Errorf("unexpected output (-want +got):\n%s", diff) + } + }) + + t.Run("parameters", func(t *testing.T) { + var b bytes.Buffer + if err := showInfo(&api.ShowResponse{ + Details: api.ModelDetails{ + Family: "test", + ParameterSize: "7B", + QuantizationLevel: "FP16", + }, + Parameters: ` + stop never + stop gonna + stop give + stop you + stop up + temperature 99`, + }, &b); err != nil { + t.Fatal(err) + } + + expect := ` Model + architecture test + parameters 7B + quantization FP16 + + Parameters + stop never + stop gonna + stop give + stop you + stop up + temperature 99 + +` + if diff := cmp.Diff(expect, b.String()); diff != "" { + t.Errorf("unexpected output (-want +got):\n%s", diff) + } + }) + + t.Run("project info", func(t *testing.T) { + var b bytes.Buffer + if err := showInfo(&api.ShowResponse{ + Details: api.ModelDetails{ + Family: "test", + ParameterSize: "7B", + QuantizationLevel: "FP16", + }, + ProjectorInfo: map[string]any{ + "general.architecture": "clip", + "general.parameter_count": float64(133_700_000), + "clip.vision.embedding_length": float64(0), + "clip.vision.projection_dim": float64(0), + }, + }, &b); err != nil { + t.Fatal(err) + } + + expect := ` Model + architecture test + parameters 7B + quantization FP16 + + Projector + architecture clip + parameters 133.70M + embedding length 0 + dimensions 0 + +` + if diff := cmp.Diff(expect, b.String()); diff != "" { + t.Errorf("unexpected output (-want +got):\n%s", diff) + } + }) + + t.Run("system", func(t *testing.T) { + var b bytes.Buffer + if err := showInfo(&api.ShowResponse{ + Details: api.ModelDetails{ + Family: "test", + ParameterSize: "7B", + 
QuantizationLevel: "FP16", + }, + System: `You are a pirate! +Ahoy, matey! +Weigh anchor! + `, + }, &b); err != nil { + t.Fatal(err) + } + + expect := ` Model + architecture test + parameters 7B + quantization FP16 + + System + You are a pirate! + Ahoy, matey! + +` + if diff := cmp.Diff(expect, b.String()); diff != "" { + t.Errorf("unexpected output (-want +got):\n%s", diff) + } + }) + + t.Run("license", func(t *testing.T) { + var b bytes.Buffer + license, err := os.ReadFile(filepath.Join("..", "LICENSE")) + if err != nil { + t.Fatal(err) + } + + if err := showInfo(&api.ShowResponse{ + Details: api.ModelDetails{ + Family: "test", + ParameterSize: "7B", + QuantizationLevel: "FP16", + }, + License: string(license), + }, &b); err != nil { + t.Fatal(err) + } + + expect := ` Model + architecture test + parameters 7B + quantization FP16 + + License + MIT License + Copyright (c) Ollama + +` + if diff := cmp.Diff(expect, b.String()); diff != "" { + t.Errorf("unexpected output (-want +got):\n%s", diff) + } + }) +} diff --git a/cmd/interactive.go b/cmd/interactive.go index 4462cf29..94578f11 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -18,7 +18,6 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/parser" - "github.com/ollama/ollama/progress" "github.com/ollama/ollama/readline" "github.com/ollama/ollama/types/errtypes" ) @@ -31,26 +30,6 @@ const ( MultilineSystem ) -func loadModel(cmd *cobra.Command, opts *runOptions) error { - p := progress.NewProgress(os.Stderr) - defer p.StopAndClear() - - spinner := progress.NewSpinner("") - p.Add("", spinner) - - client, err := api.ClientFromEnvironment() - if err != nil { - return err - } - - chatReq := &api.ChatRequest{ - Model: opts.Model, - KeepAlive: opts.KeepAlive, - } - - return client.Chat(cmd.Context(), chatReq, func(api.ChatResponse) error { return nil }) -} - func generateInteractive(cmd *cobra.Command, opts runOptions) error { usage := func() { fmt.Fprintln(os.Stderr, "Available Commands:") @@ -217,7 +196,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { opts.Model = args[1] opts.Messages = []api.Message{} fmt.Printf("Loading model '%s'\n", opts.Model) - if err := loadModel(cmd, &opts); err != nil { + if err := loadOrUnloadModel(cmd, &opts); err != nil { return err } continue @@ -371,7 +350,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { switch args[1] { case "info": - showInfo(resp) + _ = showInfo(resp, os.Stderr) case "license": if resp.License == "" { fmt.Println("No license was specified for this model.") diff --git a/convert/convert.go b/convert/convert.go index 8c7b0943..44783b6e 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -208,14 +208,18 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error { return err } - if vocabSize := int(p.VocabSize); vocabSize > len(t.Vocabulary.Tokens) { - slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", p.VocabSize, "actual", len(t.Vocabulary.Tokens)) + vocabSize := int(p.VocabSize) + switch { + case vocabSize > len(t.Vocabulary.Tokens): + slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens)) for i := range vocabSize - len(t.Vocabulary.Tokens) { t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i)) t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1) t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined) } - } else { + case 
vocabSize < len(t.Vocabulary.Tokens): + return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize) + default: slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } diff --git a/docs/api.md b/docs/api.md index aed2b69f..1ae60dc7 100644 --- a/docs/api.md +++ b/docs/api.md @@ -69,7 +69,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3", + "model": "llama3.1", "prompt": "Why is the sky blue?" }' ``` @@ -80,7 +80,7 @@ A stream of JSON objects is returned: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T08:52:19.385406455-07:00", "response": "The", "done": false @@ -102,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s), ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T19:22:45.499127Z", "response": "", "done": true, @@ -124,7 +124,7 @@ A response can be received in one reply when streaming is off. ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3", + "model": "llama3.1", "prompt": "Why is the sky blue?", "stream": false }' @@ -136,7 +136,7 @@ If `stream` is set to `false`, the response will be a single JSON object: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T19:22:45.499127Z", "response": "The sky is blue because it is the color of the sky.", "done": true, @@ -194,7 +194,7 @@ curl http://localhost:11434/api/generate -d '{ ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3", + "model": "llama3.1", "prompt": "What color is the sky at different times of the day? Respond using JSON", "format": "json", "stream": false @@ -205,7 +205,7 @@ curl http://localhost:11434/api/generate -d '{ ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-11-09T21:07:55.186497Z", "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n", "done": true, @@ -327,7 +327,7 @@ If you want to set custom options for the model at runtime rather than in the Mo ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3", + "model": "llama3.1", "prompt": "Why is the sky blue?", "stream": false, "options": { @@ -368,7 +368,7 @@ curl http://localhost:11434/api/generate -d '{ ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T19:22:45.499127Z", "response": "The sky is blue because it is the color of the sky.", "done": true, @@ -390,7 +390,7 @@ If an empty prompt is provided, the model will be loaded into memory. ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3" + "model": "llama3.1" }' ``` @@ -400,7 +400,7 @@ A single JSON object is returned: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-12-18T19:52:07.071755Z", "response": "", "done": true @@ -445,7 +445,7 @@ Send a chat message with a streaming response. 
```shell curl http://localhost:11434/api/chat -d '{ - "model": "llama3", + "model": "llama3.1", "messages": [ { "role": "user", @@ -461,7 +461,7 @@ A stream of JSON objects is returned: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T08:52:19.385406455-07:00", "message": { "role": "assistant", @@ -476,7 +476,7 @@ Final response: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T19:22:45.499127Z", "done": true, "total_duration": 4883583458, @@ -494,7 +494,7 @@ Final response: ```shell curl http://localhost:11434/api/chat -d '{ - "model": "llama3", + "model": "llama3.1", "messages": [ { "role": "user", @@ -509,7 +509,7 @@ curl http://localhost:11434/api/chat -d '{ ```json { - "model": "registry.ollama.ai/library/llama3:latest", + "model": "llama3.1", "created_at": "2023-12-12T14:13:43.416799Z", "message": { "role": "assistant", @@ -533,7 +533,7 @@ Send a chat message with a conversation history. You can use this same approach ```shell curl http://localhost:11434/api/chat -d '{ - "model": "llama3", + "model": "llama3.1", "messages": [ { "role": "user", @@ -557,7 +557,7 @@ A stream of JSON objects is returned: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T08:52:19.385406455-07:00", "message": { "role": "assistant", @@ -571,7 +571,7 @@ Final response: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T19:22:45.499127Z", "done": true, "total_duration": 8113331500, @@ -629,7 +629,7 @@ curl http://localhost:11434/api/chat -d '{ ```shell curl http://localhost:11434/api/chat -d '{ - "model": "llama3", + "model": "llama3.1", "messages": [ { "role": "user", @@ -647,7 +647,7 @@ curl http://localhost:11434/api/chat -d '{ ```json { - "model": "registry.ollama.ai/library/llama3:latest", + "model": "llama3.1", "created_at": "2023-12-12T14:13:43.416799Z", "message": { "role": "assistant", @@ -904,7 +904,7 @@ Show information about a model including details, modelfile, template, parameter ```shell curl http://localhost:11434/api/show -d '{ - "name": "llama3" + "name": "llama3.1" }' ``` @@ -965,7 +965,7 @@ Copy a model. Creates a model with another name from an existing model. ```shell curl http://localhost:11434/api/copy -d '{ - "source": "llama3", + "source": "llama3.1", "destination": "llama3-backup" }' ``` @@ -1020,7 +1020,7 @@ Download a model from the ollama library. 
Cancelled pulls are resumed from where ```shell curl http://localhost:11434/api/pull -d '{ - "name": "llama3" + "name": "llama3.1" }' ``` diff --git a/docs/faq.md b/docs/faq.md index 356d5105..6267ad2b 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter: ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3", + "model": "llama3.1", "prompt": "Why is the sky blue?", "options": { "num_ctx": 4096 @@ -247,12 +247,12 @@ The `keep_alive` parameter can be set to: For example, to preload a model and leave it in memory use: ```shell -curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}' +curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": -1}' ``` To unload the model and free up memory use: ```shell -curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}' +curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}' ``` Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable. diff --git a/docs/import.md b/docs/import.md index 1a90bc48..2346886f 100644 --- a/docs/import.md +++ b/docs/import.md @@ -38,7 +38,7 @@ Ollama supports importing adapters based on several different model architecture You can create the adapter using a fine tuning framework or tool which can output adapters in the Safetensors format, such as: - * Hugging Face [fine tuning framework] (https://huggingface.co/docs/transformers/en/training) + * Hugging Face [fine tuning framework](https://huggingface.co/docs/transformers/en/training) * [Unsloth](https://github.com/unslothai/unsloth) * [MLX](https://github.com/ml-explore/mlx) diff --git a/docs/linux.md b/docs/linux.md index 46c17a20..0eec014f 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -1,43 +1,57 @@ -# Ollama on Linux +# Linux ## Install -Install Ollama running this one-liner: +To install Ollama, run the following command: -> - -```bash +```shell curl -fsSL https://ollama.com/install.sh | sh ``` -## AMD Radeon GPU support - -While AMD has contributed the `amdgpu` driver upstream to the official linux -kernel source, the version is older and may not support all ROCm features. We -recommend you install the latest driver from -https://www.amd.com/en/support/linux-drivers for best support of your Radeon -GPU. 
- ## Manual install -### Download `ollama` +Download and extract the package: -Download and extract the Linux package: - -```bash -curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr +```shell +curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz +sudo tar -C /usr -xzf ollama-linux-amd64.tgz ``` -If you have an AMD GPU, also download and extract the ROCm package into the same location -```bash -curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tgz | sudo tar zx -C /usr +Start Ollama: + +```shell +ollama serve +``` + +In another terminal, verify that Ollama is running: + +```shell +ollama -v +``` + +### AMD GPU install + +If you have an AMD GPU, also download and extract the additional ROCm package: + +```shell +curl -L https://ollama.com/download/ollama-linux-amd64-rocm.tgz -o ollama-linux-amd64-rocm.tgz +sudo tar -C /usr -xzf ollama-linux-amd64-rocm.tgz +``` + +### ARM64 install + +Download and extract the ARM64-specific package: + +```shell +curl -L https://ollama.com/download/ollama-linux-arm64.tgz -o ollama-linux-arm64.tgz +sudo tar -C /usr -xzf ollama-linux-arm64.tgz ``` ### Adding Ollama as a startup service (recommended) Create a user and group for Ollama: -```bash +```shell sudo useradd -r -s /bin/false -U -m -d /usr/share/ollama ollama sudo usermod -a -G ollama $(whoami) ``` @@ -63,47 +77,54 @@ WantedBy=default.target Then start the service: -```bash +```shell sudo systemctl daemon-reload sudo systemctl enable ollama ``` -### Install CUDA drivers (optional – for Nvidia GPUs) +### Install CUDA drivers (optional) [Download and install](https://developer.nvidia.com/cuda-downloads) CUDA. Verify that the drivers are installed by running the following command, which should print details about your GPU: -```bash +```shell nvidia-smi ``` -### Install ROCm (optional - for Radeon GPUs) -[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html) +### Install AMD ROCm drivers (optional) -Make sure to install ROCm v6 +[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html) ROCm v6. ### Start Ollama Start Ollama and verify it is running: -```bash +```shell sudo systemctl start ollama sudo systemctl status ollama ``` -## Update +> [!NOTE] +> While AMD has contributed the `amdgpu` driver upstream to the official linux +> kernel source, the version is older and may not support all ROCm features. We +> recommend you install the latest driver from +> https://www.amd.com/en/support/linux-drivers for best support of your Radeon +> GPU. 
-Update ollama by running the install script again: +## Updating -```bash +Update Ollama by running the install script again: + +```shell curl -fsSL https://ollama.com/install.sh | sh ``` -Or by downloading the ollama binary: +Or by re-downloading Ollama: -```bash -curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr +```shell +curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz +sudo tar -C /usr -xzf ollama-linux-amd64.tgz ``` ## Installing specific versions @@ -112,15 +133,15 @@ Use `OLLAMA_VERSION` environment variable with the install script to install a s For example: -``` -curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.1.32 sh +```shell +curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.3.9 sh ``` ## Viewing logs To view logs of Ollama running as a startup service, run: -```bash +```shell journalctl -e -u ollama ``` @@ -128,7 +149,7 @@ journalctl -e -u ollama Remove the ollama service: -```bash +```shell sudo systemctl stop ollama sudo systemctl disable ollama sudo rm /etc/systemd/system/ollama.service @@ -136,13 +157,13 @@ sudo rm /etc/systemd/system/ollama.service Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr/bin`, or `/bin`): -```bash +```shell sudo rm $(which ollama) ``` Remove the downloaded models and Ollama service user and group: -```bash +```shell sudo rm -r /usr/share/ollama sudo userdel ollama sudo groupdel ollama diff --git a/docs/modelfile.md b/docs/modelfile.md index 92df22ef..a33f180b 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -11,7 +11,7 @@ A model file is the blueprint to create and share models with Ollama. - [Examples](#examples) - [Instructions](#instructions) - [FROM (Required)](#from-required) - - [Build from llama3.1](#build-from-llama31) + - [Build from existing model](#build-from-existing-model) - [Build from a Safetensors model](#build-from-a-safetensors-model) - [Build from a GGUF file](#build-from-a-gguf-file) - [PARAMETER](#parameter) @@ -50,7 +50,7 @@ INSTRUCTION arguments An example of a `Modelfile` creating a mario blueprint: ```modelfile -FROM llama3 +FROM llama3.1 # sets the temperature to 1 [higher is more creative, lower is more coherent] PARAMETER temperature 1 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token @@ -72,10 +72,10 @@ More examples are available in the [examples directory](../examples). To view the Modelfile of a given model, use the `ollama show --modelfile` command. ```bash - > ollama show --modelfile llama3 + > ollama show --modelfile llama3.1 # Modelfile generated by "ollama show" # To build a new Modelfile based on this one, replace the FROM line with: - # FROM llama3:latest + # FROM llama3.1:latest FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29 TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|> @@ -100,7 +100,7 @@ The `FROM` instruction defines the base model to use when creating a model. 
FROM : ``` -#### Build from llama3.1 +#### Build from existing model ```modelfile FROM llama3.1 diff --git a/docs/openai.md b/docs/openai.md index 0cbea6cc..c6df0fec 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create( 'content': 'Say this is a test', } ], - model='llama3', + model='llama3.1', ) response = client.chat.completions.create( @@ -46,13 +46,13 @@ response = client.chat.completions.create( ) completion = client.completions.create( - model="llama3", + model="llama3.1", prompt="Say this is a test", ) list_completion = client.models.list() -model = client.models.retrieve("llama3") +model = client.models.retrieve("llama3.1") embeddings = client.embeddings.create( model="all-minilm", @@ -74,7 +74,7 @@ const openai = new OpenAI({ const chatCompletion = await openai.chat.completions.create({ messages: [{ role: 'user', content: 'Say this is a test' }], - model: 'llama3', + model: 'llama3.1', }) const response = await openai.chat.completions.create({ @@ -94,13 +94,13 @@ const response = await openai.chat.completions.create({ }) const completion = await openai.completions.create({ - model: "llama3", + model: "llama3.1", prompt: "Say this is a test.", }) const listCompletion = await openai.models.list() -const model = await openai.models.retrieve("llama3") +const model = await openai.models.retrieve("llama3.1") const embedding = await openai.embeddings.create({ model: "all-minilm", @@ -114,7 +114,7 @@ const embedding = await openai.embeddings.create({ curl http://localhost:11434/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "llama3", + "model": "llama3.1", "messages": [ { "role": "system", @@ -154,13 +154,13 @@ curl http://localhost:11434/v1/chat/completions \ curl http://localhost:11434/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "llama3", + "model": "llama3.1", "prompt": "Say this is a test" }' curl http://localhost:11434/v1/models -curl http://localhost:11434/v1/models/llama3 +curl http://localhost:11434/v1/models/llama3.1 curl http://localhost:11434/v1/embeddings \ -H "Content-Type: application/json" \ @@ -274,7 +274,7 @@ curl http://localhost:11434/v1/embeddings \ Before using a model, pull it locally `ollama pull`: ```shell -ollama pull llama3 +ollama pull llama3.1 ``` ### Default model names @@ -282,7 +282,7 @@ ollama pull llama3 For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name: ``` -ollama cp llama3 gpt-3.5-turbo +ollama cp llama3.1 gpt-3.5-turbo ``` Afterwards, this new model name can be specified the `model` field: diff --git a/docs/template.md b/docs/template.md index 1d7104de..192d878d 100644 --- a/docs/template.md +++ b/docs/template.md @@ -33,7 +33,7 @@ Omitting a template in these models puts the responsibility of correctly templat To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3. 
```dockerfile -FROM llama3 +FROM llama3.1 TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|> diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 589061a8..0a89b87f 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -91,6 +91,17 @@ If none of those resolve the problem, gather additional information and file an - Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia` +## AMD GPU Discovery + +On Linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log. + +When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments, and pass additional `--group-add ...` arguments to the container so it can access the required devices. + +If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure. +- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can show more detailed error codes that help troubleshoot problems. +- `OLLAMA_DEBUG=1` Report additional information during GPU discovery. +- Check dmesg for any errors from the amdgpu or kfd drivers: `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd` + ## Windows Terminal Errors Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in a long string of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect` To resolve this problem, please update to Win 10 22H1 or newer. diff --git a/docs/windows.md b/docs/windows.md index f681ffac..372a35aa 100644 --- a/docs/windows.md +++ b/docs/windows.md @@ -29,7 +29,7 @@ Ollama uses unicode characters for progress indication, which may render as unkn Here's a quick example showing API access from `powershell` ```powershell -(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json +(Invoke-WebRequest -method POST -Body '{"model":"llama3.1", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json ``` ## Troubleshooting diff --git a/envconfig/config.go b/envconfig/config.go index 14e3cb0c..9c1490a9 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -179,53 +179,6 @@ var ( HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION") ) -func RunnersDir() (p string) { - if p := Var("OLLAMA_RUNNERS_DIR"); p != "" { - return p - } - - if runtime.GOOS != "windows" { - return - } - - defer func() { - if p == "" { - slog.Error("unable to locate llm runner directory.
Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'") - } - }() - - // On Windows we do not carry the payloads inside the main executable - exe, err := os.Executable() - if err != nil { - return - } - - cwd, err := os.Getwd() - if err != nil { - return - } - - var paths []string - for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), LibRelativeToExe()), cwd} { - paths = append(paths, - root, - filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH), - filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH), - ) - } - - // Try a few variations to improve developer experience when building from source in the local tree - for _, path := range paths { - candidate := filepath.Join(path, "lib", "ollama", "runners") - if _, err := os.Stat(candidate); err == nil { - p = candidate - break - } - } - - return p -} - func Uint(key string, defaultValue uint) func() uint { return func() uint { if s := Var(key); s != "" { @@ -290,10 +243,22 @@ func AsMap() map[string]EnvVar { "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"}, "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"}, - "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"}, + + // Informational + "HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"}, + "HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"}, + "NO_PROXY": {"NO_PROXY", String("NO_PROXY")(), "No proxy"}, } + + if runtime.GOOS != "windows" { + // Windows environment variables are case-insensitive so there's no need to duplicate them + ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"} + ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"} + ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"} + } + if runtime.GOOS != "darwin" { ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"} ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"} @@ -302,6 +267,7 @@ func AsMap() map[string]EnvVar { ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"} ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"} } + return ret } diff --git a/examples/langchain-python-rag-privategpt/requirements.txt b/examples/langchain-python-rag-privategpt/requirements.txt index 0aad1fe5..4f2cee25 100644 --- a/examples/langchain-python-rag-privategpt/requirements.txt +++ b/examples/langchain-python-rag-privategpt/requirements.txt @@ -1,6 +1,6 @@ langchain==0.0.274 gpt4all==1.0.8 -chromadb==0.4.7 +chromadb==0.5.0 llama-cpp-python==0.1.81 urllib3==2.0.4 PyMuPDF==1.23.5 @@ -12,4 +12,4 @@ pandoc==2.3 pypandoc==1.11 tqdm==4.66.1 sentence_transformers==2.2.2 -numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability \ No newline at end of file +numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability diff --git a/examples/python-loganalysis/Modelfile 
b/examples/python-loganalysis/Modelfile index 5237cb6e..b28aa0c0 100644 --- a/examples/python-loganalysis/Modelfile +++ b/examples/python-loganalysis/Modelfile @@ -4,5 +4,5 @@ SYSTEM """ You are a log file analyzer. You will receive a set of lines from a log file for some software application, find the errors and other interesting aspects of the logs, and explain them so a new user can understand what they mean. If there are any steps they can do to resolve them, list the steps in your answer. """ -PARAMETER TEMPERATURE 0.3 +PARAMETER temperature 0.3 diff --git a/examples/python-loganalysis/readme.md b/examples/python-loganalysis/readme.md index 4be0baaa..03bab672 100644 --- a/examples/python-loganalysis/readme.md +++ b/examples/python-loganalysis/readme.md @@ -21,6 +21,8 @@ You can try this with the `logtest.logfile` file included in this directory. 2. Install the Python Requirements. ```bash + python3 -m venv .venv + source .venv/bin/activate pip install -r requirements.txt ``` diff --git a/examples/python-loganalysis/requirements.txt b/examples/python-loganalysis/requirements.txt index 9688b8ec..e7cb17ef 100644 --- a/examples/python-loganalysis/requirements.txt +++ b/examples/python-loganalysis/requirements.txt @@ -1 +1 @@ -Requests==2.31.0 +Requests>=2.32.3 diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index 33300eb6..3db201b8 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "io/fs" "log/slog" "os" "path/filepath" @@ -359,6 +360,10 @@ func AMDGetGPUInfo() []RocmGPUInfo { if len(resp) == 0 { slog.Info("no compatible amdgpu devices detected") } + if err := verifyKFDDriverAccess(); err != nil { + slog.Error("amdgpu devices detected but permission problems block access", "error", err) + return nil + } return resp } @@ -455,3 +460,19 @@ func getFreeMemory(usedFile string) (uint64, error) { } return usedMemory, nil } + +func verifyKFDDriverAccess() error { + // Verify we have permissions - either running as root, or we have group access to the driver + fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0o666) + if err != nil { + if errors.Is(err, fs.ErrPermission) { + return fmt.Errorf("permissions not set up properly. Either run ollama as root, or add your user account to the render group. %w", err) + } else if errors.Is(err, fs.ErrNotExist) { + // Container runtime failure? + return fmt.Errorf("kfd driver not loaded.
If running in a container, remember to include '--device /dev/kfd --device /dev/dri'") + } + return fmt.Errorf("failed to check permission on /dev/kfd: %w", err) + } + fd.Close() + return nil +} diff --git a/gpu/assets.go b/gpu/assets.go deleted file mode 100644 index 6d62d0dc..00000000 --- a/gpu/assets.go +++ /dev/null @@ -1,148 +0,0 @@ -package gpu - -import ( - "errors" - "fmt" - "log/slog" - "os" - "path/filepath" - "runtime" - "strconv" - "strings" - "sync" - "syscall" - "time" - - "github.com/ollama/ollama/envconfig" -) - -var ( - lock sync.Mutex - payloadsDir = "" -) - -func PayloadsDir() (string, error) { - lock.Lock() - defer lock.Unlock() - var err error - if payloadsDir == "" { - runnersDir := envconfig.RunnersDir() - - if runnersDir != "" { - payloadsDir = runnersDir - return payloadsDir, nil - } - - // The remainder only applies on non-windows where we still carry payloads in the main executable - cleanupTmpDirs() - tmpDir := envconfig.TmpDir() - if tmpDir == "" { - tmpDir, err = os.MkdirTemp("", "ollama") - if err != nil { - return "", fmt.Errorf("failed to generate tmp dir: %w", err) - } - } else { - err = os.MkdirAll(tmpDir, 0o755) - if err != nil { - return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err) - } - } - - // Track our pid so we can clean up orphaned tmpdirs - n := filepath.Join(tmpDir, "ollama.pid") - if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil { - return "", fmt.Errorf("failed to write pid file %s: %w", n, err) - } - - // We create a distinct subdirectory for payloads within the tmpdir - // This will typically look like /tmp/ollama3208993108/runners on linux - payloadsDir = filepath.Join(tmpDir, "runners") - } - return payloadsDir, nil -} - -// Best effort to clean up prior tmpdirs -func cleanupTmpDirs() { - matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid")) - if err != nil { - return - } - - for _, match := range matches { - raw, err := os.ReadFile(match) - if errors.Is(err, os.ErrNotExist) { - slog.Debug("not a ollama runtime directory, skipping", "path", match) - continue - } else if err != nil { - slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err) - continue - } - - pid, err := strconv.Atoi(string(raw)) - if err != nil { - slog.Warn("invalid pid, skipping", "path", match, "error", err) - continue - } - - p, err := os.FindProcess(pid) - if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) { - slog.Warn("process still running, skipping", "pid", pid, "path", match) - continue - } - - if err := os.Remove(match); err != nil { - slog.Warn("could not cleanup stale pidfile", "path", match, "error", err) - } - - runners := filepath.Join(filepath.Dir(match), "runners") - if err := os.RemoveAll(runners); err != nil { - slog.Warn("could not cleanup stale runners", "path", runners, "error", err) - } - - if err := os.Remove(filepath.Dir(match)); err != nil { - slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err) - } - } -} - -func Cleanup() { - lock.Lock() - defer lock.Unlock() - runnersDir := envconfig.RunnersDir() - if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" { - // We want to fully clean up the tmpdir parent of the payloads dir - tmpDir := filepath.Clean(filepath.Join(payloadsDir, "..")) - slog.Debug("cleaning up", "dir", tmpDir) - err := os.RemoveAll(tmpDir) - if err != nil { - // On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove - 
time.Sleep(1000 * time.Millisecond) - err = os.RemoveAll(tmpDir) - if err != nil { - slog.Warn("failed to clean up", "dir", tmpDir, "err", err) - } - } - } -} - -func UpdatePath(dir string) { - if runtime.GOOS == "windows" { - tmpDir := filepath.Dir(dir) - pathComponents := strings.Split(os.Getenv("PATH"), ";") - i := 0 - for _, comp := range pathComponents { - if strings.EqualFold(comp, dir) { - return - } - // Remove any other prior paths to our temp dir - if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) { - pathComponents[i] = comp - i++ - } - } - newPath := strings.Join(append([]string{dir}, pathComponents...), ";") - slog.Info("updating", "PATH", newPath) - os.Setenv("PATH", newPath) - } - // linux and darwin rely on rpath -} diff --git a/gpu/gpu.go b/gpu/gpu.go index 7b2bd810..6bae571f 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -93,10 +93,9 @@ func initCudaHandles() *cudaHandles { localAppData := os.Getenv("LOCALAPPDATA") cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)} } - tmpDir, _ := PayloadsDir() - if tmpDir != "" { - // TODO - add "payloads" for subprocess - cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)} + libDir := LibraryDir() + if libDir != "" { + cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)} } cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index fc673c47..6ce457ae 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -913,7 +913,9 @@ struct llama_server_context slot.sampled = result.tok; // search stop word and delete it - slot.generated_text += token_str; + if (!llama_token_is_eog(model, result.tok)) + slot.generated_text += token_str; + slot.has_next_token = true; if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) @@ -954,30 +956,36 @@ struct llama_server_context if (!incomplete) { size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); - const std::string str_test = slot.generated_text.substr(pos); - bool is_stop_full = false; - size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); - if (stop_pos != std::string::npos) - { - is_stop_full = true; - slot.generated_text.erase( - slot.generated_text.begin() + pos + stop_pos, - slot.generated_text.end()); - pos = std::min(slot.n_sent_text, slot.generated_text.size()); - } - else - { - is_stop_full = false; - stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot); - } - // check if there is any token to predict - if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) - { - // no send the stop word in the response - result.text_to_send = slot.generated_text.substr(pos, std::string::npos); - slot.n_sent_text += result.text_to_send.size(); - // add the token to slot queue and cache + if (!llama_token_is_eog(model, result.tok)) { + const std::string str_test = slot.generated_text.substr(pos); + bool is_stop_full = false; + size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); + if (stop_pos != std::string::npos) + { + is_stop_full = true; + slot.generated_text.erase( + slot.generated_text.begin() + pos + stop_pos, + slot.generated_text.end()); + pos = std::min(slot.n_sent_text, slot.generated_text.size()); + } + else + { + is_stop_full = false; + stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot); + } + + // check 
if there is any token to predict + if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) + { + // no send the stop word in the response + result.text_to_send = slot.generated_text.substr(pos, std::string::npos); + slot.n_sent_text += result.text_to_send.size(); + // add the token to slot queue and cache + } + } else { + result.text_to_send = slot.generated_text.substr(pos, std::string::npos); + slot.n_sent_text += result.text_to_send.size(); } if (slot.params.stream) @@ -1117,9 +1125,7 @@ struct llama_server_context {"multimodal", multimodal} }; - if (!llama_token_is_eog(model, tkn.tok)) { - res.result_json["content"] = tkn.text_to_send; - } + res.result_json["content"] = tkn.text_to_send; if (slot.sparams.n_probs > 0) { diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index cef68ea1..ab5d7612 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -31,6 +31,7 @@ init_vars() { NO_WHOLE_ARCHIVE="" GCC_ARCH="-arch ${ARCH}" DIST_BASE=../../dist/darwin-${GOARCH}/ + PAYLOAD_BASE=../../build/darwin/${GOARCH} ;; "Linux") LIB_EXT="so" @@ -40,6 +41,7 @@ init_vars() { # Cross compiling not supported on linux - Use docker GCC_ARCH="" DIST_BASE=../../dist/linux-${GOARCH}/ + PAYLOAD_BASE=../../build/linux/${GOARCH} ;; *) ;; @@ -47,7 +49,8 @@ init_vars() { if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80" fi - GZIP=$(which pigz 2>/dev/null || echo "gzip") + GZIP=$(command -v pigz 2>/dev/null || echo "gzip") + RUNNER_BASE="${DIST_BASE}/lib/ollama/runners" } git_module_setup() { @@ -91,17 +94,34 @@ build() { rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal } -compress() { - echo "Compressing payloads to reduce overall binary size..." - rm -rf ${BUILD_DIR}/bin/*.gz +dist() { + [ -z "${RUNNER}" ] && exit 1 + mkdir -p ${RUNNER_BASE}/${RUNNER}/ for f in ${BUILD_DIR}/bin/* ; do - ${GZIP} -n --best -f ${f} & + cp ${f} ${RUNNER_BASE}/${RUNNER}/ + done + # check for lib directory + if [ -d ${BUILD_DIR}/lib ]; then + for f in ${BUILD_DIR}/lib/* ; do + cp ${f} ${RUNNER_BASE}/${RUNNER}/ + done + fi +} + +# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir +compress() { + [ -z "${RUNNER}" ] && exit 1 + echo "Compressing payloads with ${GZIP} to reduce overall binary size..." + rm -rf "${PAYLOAD_BASE}/${RUNNER}/" + mkdir -p "${PAYLOAD_BASE}/${RUNNER}/" + for f in ${BUILD_DIR}/bin/* ; do + ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" & compress_pids+=" $!" done # check for lib directory if [ -d ${BUILD_DIR}/lib ]; then for f in ${BUILD_DIR}/lib/* ; do - ${GZIP} -n --best -f ${f} & + ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" & compress_pids+=" $!" 
done fi @@ -117,7 +137,7 @@ wait_for_compress() { install() { echo "Installing libraries to bin dir ${BUILD_DIR}/bin/" - for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do + for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do rm -f "${BUILD_DIR}/bin/$(basename ${lib})" cp -af "${lib}" "${BUILD_DIR}/bin/" done diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index acea9c8d..c37366f3 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -39,7 +39,8 @@ case "${GOARCH}" in # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="../build/darwin/${ARCH}/cpu" + RUNNER=cpu + BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}" echo "Building LCD CPU" build sign ${BUILD_DIR}/bin/ollama_llama_server @@ -51,7 +52,8 @@ case "${GOARCH}" in # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="../build/darwin/${ARCH}/cpu_avx" + RUNNER=cpu_avx + BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}" echo "Building AVX CPU" build sign ${BUILD_DIR}/bin/ollama_llama_server @@ -63,7 +65,8 @@ case "${GOARCH}" in # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}" - BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2" + RUNNER=cpu_avx2 + BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}" echo "Building AVX2 CPU" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" build @@ -84,7 +87,8 @@ case "${GOARCH}" in if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then init_vars CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" - BUILD_DIR="../build/darwin/${ARCH}/metal" + RUNNER="metal" + BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" build sign ${BUILD_DIR}/bin/ollama_llama_server diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index fae3785c..474b67f1 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -80,10 +80,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then init_vars echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" - BUILD_DIR="../build/linux/${ARCH}/cpu" + RUNNER="cpu" + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" echo "Building custom CPU" build install + dist compress else # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 @@ -103,10 +105,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="../build/linux/${ARCH}/cpu" + RUNNER=cpu + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" echo "Building LCD CPU" build install + dist compress fi @@ -121,10 +125,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="../build/linux/${ARCH}/cpu_avx" + RUNNER=cpu_avx + 
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" echo "Building AVX CPU" build install + dist compress fi @@ -135,10 +141,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}" - BUILD_DIR="../build/linux/${ARCH}/cpu_avx2" + RUNNER=cpu_avx2 + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" echo "Building AVX2 CPU" build install + dist compress fi fi @@ -188,11 +196,13 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then fi export CUDAFLAGS="-t8" CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" - BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" + RUNNER=cuda${CUDA_VARIANT} + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}" build install + dist echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" mkdir -p "${CUDA_DIST_DIR}" for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do @@ -213,7 +223,8 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" - BUILD_DIR="../build/linux/${ARCH}/oneapi" + RUNNER=oneapi + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama" export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it @@ -232,6 +243,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}" cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}" install + dist compress fi @@ -260,7 +272,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}" echo "Building custom ROCM GPU" fi - BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" + RUNNER=rocm${ROCM_VARIANT} + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" # ROCm dependencies are too large to fit into a unified bundle ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama" # TODO figure out how to disable runpath (rpath) @@ -270,13 +283,17 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then # copy the ROCM dependencies mkdir -p "${ROCM_DIST_DIR}" - for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do + for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do cp -a "${dep}"* "${ROCM_DIST_DIR}" + if [ $(readlink -f "${dep}") != "${dep}" ] ; then + cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}" + fi done install + dist compress fi cleanup wait_for_compress -echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)" +echo "go generate completed. 
LLM runners: $(cd ${PAYLOAD_BASE}; echo *)" diff --git a/llm/llm_darwin_arm64.go b/llm/llm_darwin.go similarity index 55% rename from llm/llm_darwin_arm64.go rename to llm/llm_darwin.go index 20ce8552..60837ed0 100644 --- a/llm/llm_darwin_arm64.go +++ b/llm/llm_darwin.go @@ -1,11 +1,7 @@ package llm import ( - "embed" "syscall" ) -//go:embed build/darwin/arm64/*/bin/* -var libEmbed embed.FS - var LlamaServerSysProcAttr = &syscall.SysProcAttr{} diff --git a/llm/llm_darwin_amd64.go b/llm/llm_darwin_amd64.go deleted file mode 100644 index 60eed719..00000000 --- a/llm/llm_darwin_amd64.go +++ /dev/null @@ -1,11 +0,0 @@ -package llm - -import ( - "embed" - "syscall" -) - -//go:embed build/darwin/x86_64/*/bin/* -var libEmbed embed.FS - -var LlamaServerSysProcAttr = &syscall.SysProcAttr{} diff --git a/llm/llm_linux.go b/llm/llm_linux.go index 928b4e79..60837ed0 100644 --- a/llm/llm_linux.go +++ b/llm/llm_linux.go @@ -1,11 +1,7 @@ package llm import ( - "embed" "syscall" ) -//go:embed build/linux/*/*/bin/* -var libEmbed embed.FS - var LlamaServerSysProcAttr = &syscall.SysProcAttr{} diff --git a/llm/llm_windows.go b/llm/llm_windows.go index 763cccf9..74a735c2 100644 --- a/llm/llm_windows.go +++ b/llm/llm_windows.go @@ -1,13 +1,9 @@ package llm import ( - "embed" "syscall" ) -// unused on windows -var libEmbed embed.FS - const CREATE_DEFAULT_ERROR_MODE = 0x04000000 var LlamaServerSysProcAttr = &syscall.SysProcAttr{ diff --git a/llm/payload.go b/llm/payload.go deleted file mode 100644 index 963b3295..00000000 --- a/llm/payload.go +++ /dev/null @@ -1,233 +0,0 @@ -package llm - -import ( - "compress/gzip" - "errors" - "fmt" - "io" - "io/fs" - "log/slog" - "os" - "path/filepath" - "runtime" - "slices" - "strings" - - "golang.org/x/sync/errgroup" - - "github.com/ollama/ollama/gpu" -) - -var errPayloadMissing = errors.New("expected payloads not included in this build of ollama") - -func Init() error { - payloadsDir, err := gpu.PayloadsDir() - if err != nil { - return err - } - - if runtime.GOOS != "windows" { - slog.Info("extracting embedded files", "dir", payloadsDir) - binGlob := "build/*/*/*/bin/*" - - // extract server libraries - err = extractFiles(payloadsDir, binGlob) - if err != nil { - return fmt.Errorf("extract binaries: %v", err) - } - } - - var variants []string - for v := range getAvailableServers() { - variants = append(variants, v) - } - slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants)) - slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY") - - return nil -} - -// binary names may contain an optional variant separated by '_' -// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2" -// Any library without a variant is the lowest common denominator -func getAvailableServers() map[string]string { - payloadsDir, err := gpu.PayloadsDir() - if err != nil { - slog.Error("payload lookup error", "error", err) - return nil - } - - // glob payloadsDir for files that start with ollama_ - pattern := filepath.Join(payloadsDir, "*", "ollama_*") - - files, err := filepath.Glob(pattern) - if err != nil { - slog.Debug("could not glob", "pattern", pattern, "error", err) - return nil - } - - servers := make(map[string]string) - for _, file := range files { - slog.Debug("availableServers : found", "file", file) - servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file) - } - - return servers -} - -// serversForGpu returns a list of compatible servers give the provided GPU -// info, ordered by performance. 
assumes Init() has been called -// TODO - switch to metadata based mapping -func serversForGpu(info gpu.GpuInfo) []string { - // glob workDir for files that start with ollama_ - availableServers := getAvailableServers() - requested := info.Library - if info.Variant != gpu.CPUCapabilityNone.String() { - requested += "_" + info.Variant - } - - servers := []string{} - - // exact match first - for a := range availableServers { - if a == requested { - servers = []string{a} - - if a == "metal" { - return servers - } - - break - } - } - - alt := []string{} - - // Then for GPUs load alternates and sort the list for consistent load ordering - if info.Library != "cpu" { - for a := range availableServers { - if info.Library == strings.Split(a, "_")[0] && a != requested { - alt = append(alt, a) - } - } - - slices.Sort(alt) - servers = append(servers, alt...) - } - - if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") { - // Load up the best CPU variant if not primary requested - if info.Library != "cpu" { - variant := gpu.GetCPUCapability() - // If no variant, then we fall back to default - // If we have a variant, try that if we find an exact match - // Attempting to run the wrong CPU instructions will panic the - // process - if variant != gpu.CPUCapabilityNone { - for cmp := range availableServers { - if cmp == "cpu_"+variant.String() { - servers = append(servers, cmp) - break - } - } - } else { - servers = append(servers, "cpu") - } - } - - if len(servers) == 0 { - servers = []string{"cpu"} - } - } - - return servers -} - -// Return the optimal server for this CPU architecture -func serverForCpu() string { - if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" { - return "metal" - } - variant := gpu.GetCPUCapability() - availableServers := getAvailableServers() - if variant != gpu.CPUCapabilityNone { - for cmp := range availableServers { - if cmp == "cpu_"+variant.String() { - return cmp - } - } - } - return "cpu" -} - -// extract extracts the embedded files to the target directory -func extractFiles(targetDir string, glob string) error { - files, err := fs.Glob(libEmbed, glob) - if err != nil || len(files) == 0 { - return errPayloadMissing - } - - if err := os.MkdirAll(targetDir, 0o755); err != nil { - return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err) - } - - g := new(errgroup.Group) - - // build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE - for _, file := range files { - filename := file - - variant := filepath.Base(filepath.Dir(filepath.Dir(filename))) - - slog.Debug("extracting", "variant", variant, "file", filename) - - g.Go(func() error { - srcf, err := libEmbed.Open(filename) - if err != nil { - return err - } - defer srcf.Close() - - src := io.Reader(srcf) - if strings.HasSuffix(filename, ".gz") { - src, err = gzip.NewReader(src) - if err != nil { - return fmt.Errorf("decompress payload %s: %v", filename, err) - } - filename = strings.TrimSuffix(filename, ".gz") - } - - variantDir := filepath.Join(targetDir, variant) - if err := os.MkdirAll(variantDir, 0o755); err != nil { - return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err) - } - - base := filepath.Base(filename) - destFilename := filepath.Join(variantDir, base) - - _, err = os.Stat(destFilename) - switch { - case errors.Is(err, os.ErrNotExist): - destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) - if err != nil { - return fmt.Errorf("write payload %s: %v", filename, err) - } - defer destFile.Close() - if _, err := io.Copy(destFile, src); err != nil { - 
return fmt.Errorf("copy payload %s: %v", filename, err) - } - case err != nil: - return fmt.Errorf("stat payload %s: %v", filename, err) - } - return nil - }) - } - - err = g.Wait() - if err != nil { - // If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted - gpu.Cleanup() - return err - } - return nil -} diff --git a/llm/server.go b/llm/server.go index 28eb8d6f..6c504f14 100644 --- a/llm/server.go +++ b/llm/server.go @@ -24,9 +24,11 @@ import ( "golang.org/x/sync/semaphore" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/build" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/gpu" + "github.com/ollama/ollama/runners" ) type LlamaServer interface { @@ -106,7 +108,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr gpus = gpu.GetCPUInfo() } if len(gpus) == 1 && gpus[0].Library == "cpu" { - cpuRunner = serverForCpu() + cpuRunner = runners.ServerForCpu() estimate = EstimateGPULayers(gpus, ggml, projectors, opts) } else { estimate = EstimateGPULayers(gpus, ggml, projectors, opts) @@ -118,7 +120,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr opts.NumGPU = 0 case gpus[0].Library != "metal" && estimate.Layers == 0: // Don't bother loading into the GPU if no layers can fit - cpuRunner = serverForCpu() + cpuRunner = runners.ServerForCpu() gpus = gpu.GetCPUInfo() case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu": opts.NumGPU = estimate.Layers @@ -145,25 +147,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr return nil, errors.New("ollama supports only one lora adapter, but multiple were provided") } - availableServers := getAvailableServers() + rDir, err := runners.Refresh(build.EmbedFS) + if err != nil { + return nil, err + } + + availableServers := runners.GetAvailableServers(rDir) if len(availableServers) == 0 { - if runtime.GOOS != "windows" { - slog.Warn("llama server binary disappeared, reinitializing payloads") - err = Init() - if err != nil { - slog.Warn("failed to reinitialize payloads", "error", err) - return nil, err - } - availableServers = getAvailableServers() - } else { - return nil, finalErr - } + return nil, finalErr } var servers []string if cpuRunner != "" { servers = []string{cpuRunner} } else { - servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant + servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant } demandLib := envconfig.LLMLibrary() if demandLib != "" { @@ -274,7 +271,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--tensor-split", estimate.TensorSplit) } - for i := range len(servers) { + for i := range servers { dir := availableServers[servers[i]] if dir == "" { // Shouldn't happen @@ -330,7 +327,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr _, err := os.Stat(server) if errors.Is(err, os.ErrNotExist) { slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err) - err = Init() + _, err = runners.Refresh(build.EmbedFS) if err != nil { slog.Warn("failed to reinitialize payloads", "error", err) return nil, err diff --git a/openai/openai.go b/openai/openai.go index a4499682..2bf9b9f9 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -452,7 +452,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { } if 
r.Temperature != nil { - options["temperature"] = *r.Temperature * 2.0 + options["temperature"] = *r.Temperature } else { options["temperature"] = 1.0 } @@ -462,11 +462,11 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { } if r.FrequencyPenalty != nil { - options["frequency_penalty"] = *r.FrequencyPenalty * 2.0 + options["frequency_penalty"] = *r.FrequencyPenalty } if r.PresencePenalty != nil { - options["presence_penalty"] = *r.PresencePenalty * 2.0 + options["presence_penalty"] = *r.PresencePenalty } if r.TopP != nil { @@ -513,7 +513,7 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) { } if r.Temperature != nil { - options["temperature"] = *r.Temperature * 2.0 + options["temperature"] = *r.Temperature } else { options["temperature"] = 1.0 } @@ -522,9 +522,9 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) { options["seed"] = *r.Seed } - options["frequency_penalty"] = r.FrequencyPenalty * 2.0 + options["frequency_penalty"] = r.FrequencyPenalty - options["presence_penalty"] = r.PresencePenalty * 2.0 + options["presence_penalty"] = r.PresencePenalty if r.TopP != 0.0 { options["top_p"] = r.TopP diff --git a/openai/openai_test.go b/openai/openai_test.go index b34f73c5..eabf5b66 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -102,9 +102,9 @@ func TestChatMiddleware(t *testing.T) { "num_predict": 999.0, // float because JSON doesn't distinguish between float and int "seed": 123.0, "stop": []any{"\n", "stop"}, - "temperature": 6.0, - "frequency_penalty": 8.0, - "presence_penalty": 10.0, + "temperature": 3.0, + "frequency_penalty": 4.0, + "presence_penalty": 5.0, "top_p": 6.0, }, Format: "json", @@ -275,7 +275,7 @@ func TestCompletionsMiddleware(t *testing.T) { Options: map[string]any{ "frequency_penalty": 0.0, "presence_penalty": 0.0, - "temperature": 1.6, + "temperature": 0.8, "top_p": 1.0, "stop": []any{"\n", "stop"}, }, diff --git a/runners/common.go b/runners/common.go new file mode 100644 index 00000000..681c397b --- /dev/null +++ b/runners/common.go @@ -0,0 +1,384 @@ +package runners + +import ( + "compress/gzip" + "errors" + "fmt" + "io" + "io/fs" + "log/slog" + "os" + "path/filepath" + "runtime" + "slices" + "strconv" + "strings" + "sync" + "syscall" + + "golang.org/x/sync/errgroup" + + "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/gpu" +) + +const ( + binGlob = "*/*/*/*" +) + +var ( + lock sync.Mutex + runnersDir = "" +) + +// Return the location where runners are stored +// If runners are payloads, this will either extract them +// or refresh them if any have disappeared due to tmp cleaners +func Refresh(payloadFS fs.FS) (string, error) { + lock.Lock() + defer lock.Unlock() + var err error + + // Wire up extra logging on our first load + if runnersDir == "" { + defer func() { + var runners []string + for v := range GetAvailableServers(runnersDir) { + runners = append(runners, v) + } + slog.Info("Dynamic LLM libraries", "runners", runners) + slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY") + }() + } + + if hasPayloads(payloadFS) { + if runnersDir == "" { + runnersDir, err = extractRunners(payloadFS) + } else { + err = refreshRunners(payloadFS, runnersDir) + } + } else if runnersDir == "" { + runnersDir, err = locateRunners() + } + + return runnersDir, err +} + +func Cleanup(payloadFS fs.FS) { + lock.Lock() + defer lock.Unlock() + if hasPayloads(payloadFS) && runnersDir != "" { + // We want to fully clean up the tmpdir parent of the payloads dir 
+ tmpDir := filepath.Clean(filepath.Join(runnersDir, "..")) + slog.Debug("cleaning up", "dir", tmpDir) + err := os.RemoveAll(tmpDir) + if err != nil { + slog.Warn("failed to clean up", "dir", tmpDir, "err", err) + } + } +} + +func locateRunners() (string, error) { + exe, err := os.Executable() + if err != nil { + return "", err + } + + cwd, err := os.Getwd() + if err != nil { + return "", err + } + + var paths []string + for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} { + paths = append(paths, + root, + filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH), + filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH), + ) + } + + // Try a few variations to improve developer experience when building from source in the local tree + for _, path := range paths { + candidate := filepath.Join(path, "lib", "ollama", "runners") + if _, err := os.Stat(candidate); err == nil { + return candidate, nil + } + } + return "", fmt.Errorf("unable to locate runners in any search path %v", paths) +} + +// Return true if we're carying nested payloads for the runners +func hasPayloads(payloadFS fs.FS) bool { + files, err := fs.Glob(payloadFS, binGlob) + if err != nil || len(files) == 0 || (len(files) == 1 && strings.Contains(files[0], "placeholder")) { + return false + } + return true +} + +func extractRunners(payloadFS fs.FS) (string, error) { + cleanupTmpDirs() + tmpDir, err := os.MkdirTemp(envconfig.TmpDir(), "ollama") + if err != nil { + return "", fmt.Errorf("failed to generate tmp dir: %w", err) + } + // Track our pid so we can clean up orphaned tmpdirs + n := filepath.Join(tmpDir, "ollama.pid") + if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil { + slog.Warn("failed to write pid file", "file", n, "error", err) + } + // We create a distinct subdirectory for payloads within the tmpdir + // This will typically look like /tmp/ollama3208993108/runners on linux + rDir := filepath.Join(tmpDir, "runners") + + slog.Info("extracting embedded files", "dir", rDir) + return rDir, refreshRunners(payloadFS, rDir) +} + +func refreshRunners(payloadFS fs.FS, rDir string) error { + // extract or refresh server libraries + err := extractFiles(payloadFS, rDir, binGlob) + if err != nil { + return fmt.Errorf("extract binaries: %v", err) + } + return nil +} + +// extract extracts the embedded files to the target directory +func extractFiles(payloadFS fs.FS, targetDir string, glob string) error { + files, err := fs.Glob(payloadFS, glob) + if err != nil || len(files) == 0 { + // Should not happen + return fmt.Errorf("extractFiles called without payload present") + } + + if err := os.MkdirAll(targetDir, 0o755); err != nil { + return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err) + } + + g := new(errgroup.Group) + + // $OS/$GOARCH/$RUNNER/$FILE + for _, file := range files { + filename := file + + runner := filepath.Base(filepath.Dir(filename)) + + slog.Debug("extracting", "runner", runner, "payload", filename) + + g.Go(func() error { + srcf, err := payloadFS.Open(filename) + if err != nil { + return err + } + defer srcf.Close() + + src := io.Reader(srcf) + if strings.HasSuffix(filename, ".gz") { + src, err = gzip.NewReader(src) + if err != nil { + return fmt.Errorf("decompress payload %s: %v", filename, err) + } + filename = strings.TrimSuffix(filename, ".gz") + } + + runnerDir := filepath.Join(targetDir, runner) + if err := os.MkdirAll(runnerDir, 0o755); err != nil { + return fmt.Errorf("extractFiles could not 
mkdir %s: %v", runnerDir, err) + } + + base := filepath.Base(filename) + destFilename := filepath.Join(runnerDir, base) + + _, err = os.Stat(destFilename) + switch { + case errors.Is(err, os.ErrNotExist): + destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) + if err != nil { + return fmt.Errorf("write payload %s: %v", filename, err) + } + defer destFile.Close() + if _, err := io.Copy(destFile, src); err != nil { + return fmt.Errorf("copy payload %s: %v", filename, err) + } + case err != nil: + return fmt.Errorf("stat payload %s: %v", filename, err) + } + return nil + }) + } + + err = g.Wait() + if err != nil { + slog.Error("failed to extract files", "error", err) + // If we fail to extract, the payload dir is most likely unusable, so cleanup whatever we extracted + err := os.RemoveAll(targetDir) + if err != nil { + slog.Warn("failed to cleanup incomplete payload dir", "dir", targetDir, "error", err) + } + return err + } + return nil +} + +// Best effort to clean up prior tmpdirs +func cleanupTmpDirs() { + tmpDir := envconfig.TmpDir() + if tmpDir == "" { + tmpDir = os.TempDir() + } + matches, err := filepath.Glob(filepath.Join(tmpDir, "ollama*", "ollama.pid")) + if err != nil { + return + } + + for _, match := range matches { + raw, err := os.ReadFile(match) + if errors.Is(err, os.ErrNotExist) { + slog.Debug("not a ollama runtime directory, skipping", "path", match) + continue + } else if err != nil { + slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err) + continue + } + + pid, err := strconv.Atoi(string(raw)) + if err != nil { + slog.Warn("invalid pid, skipping", "path", match, "error", err) + continue + } + + p, err := os.FindProcess(pid) + if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) { + slog.Warn("process still running, skipping", "pid", pid, "path", match) + continue + } + + if err := os.Remove(match); err != nil { + slog.Warn("could not cleanup stale pidfile", "path", match, "error", err) + } + + runners := filepath.Join(filepath.Dir(match), "runners") + if err := os.RemoveAll(runners); err != nil { + slog.Warn("could not cleanup stale runners", "path", runners, "error", err) + } + + if err := os.Remove(filepath.Dir(match)); err != nil { + slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err) + } + } +} + +// directory names are the name of the runner and may contain an optional +// variant prefixed with '_' as the separator. For example, "cuda_v11" and +// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the +// lowest common denominator +func GetAvailableServers(payloadsDir string) map[string]string { + if payloadsDir == "" { + slog.Error("empty runner dir") + return nil + } + + // glob payloadsDir for files that start with ollama_ + pattern := filepath.Join(payloadsDir, "*", "ollama_*") + + files, err := filepath.Glob(pattern) + if err != nil { + slog.Debug("could not glob", "pattern", pattern, "error", err) + return nil + } + + servers := make(map[string]string) + for _, file := range files { + slog.Debug("availableServers : found", "file", file) + servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file) + } + + return servers +} + +// serversForGpu returns a list of compatible servers give the provided GPU +// info, ordered by performance. 
assumes Init() has been called +// TODO - switch to metadata based mapping +func ServersForGpu(info gpu.GpuInfo) []string { + // glob workDir for files that start with ollama_ + availableServers := GetAvailableServers(runnersDir) + requested := info.Library + if info.Variant != gpu.CPUCapabilityNone.String() { + requested += "_" + info.Variant + } + + servers := []string{} + + // exact match first + for a := range availableServers { + if a == requested { + servers = []string{a} + + if a == "metal" { + return servers + } + + break + } + } + + alt := []string{} + + // Then for GPUs load alternates and sort the list for consistent load ordering + if info.Library != "cpu" { + for a := range availableServers { + if info.Library == strings.Split(a, "_")[0] && a != requested { + alt = append(alt, a) + } + } + + slices.Sort(alt) + servers = append(servers, alt...) + } + + if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") { + // Load up the best CPU variant if not primary requested + if info.Library != "cpu" { + variant := gpu.GetCPUCapability() + // If no variant, then we fall back to default + // If we have a variant, try that if we find an exact match + // Attempting to run the wrong CPU instructions will panic the + // process + if variant != gpu.CPUCapabilityNone { + for cmp := range availableServers { + if cmp == "cpu_"+variant.String() { + servers = append(servers, cmp) + break + } + } + } else { + servers = append(servers, "cpu") + } + } + + if len(servers) == 0 { + servers = []string{"cpu"} + } + } + + return servers +} + +// Return the optimal server for this CPU architecture +func ServerForCpu() string { + if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" { + return "metal" + } + variant := gpu.GetCPUCapability() + availableServers := GetAvailableServers(runnersDir) + if variant != gpu.CPUCapabilityNone { + for cmp := range availableServers { + if cmp == "cpu_"+variant.String() { + return cmp + } + } + } + return "cpu" +} diff --git a/runners/runners_test.go b/runners/runners_test.go new file mode 100644 index 00000000..e6439448 --- /dev/null +++ b/runners/runners_test.go @@ -0,0 +1,50 @@ +package runners + +import ( + "log/slog" + "os" + "path" + "runtime" + "strings" + "testing" + "testing/fstest" +) + +func TestRefreshRunners(t *testing.T) { + slog.SetLogLoggerLevel(slog.LevelDebug) + + payloadFS := fstest.MapFS{ + path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")}, + } + tmpDir, err := os.MkdirTemp("", "testing") + if err != nil { + t.Fatalf("failed to make tmp dir %s", err) + } + t.Setenv("OLLAMA_TMPDIR", tmpDir) + rDir, err := Refresh(payloadFS) + if err != nil { + t.Fatalf("failed to extract to %s %s", tmpDir, err) + } + if !strings.Contains(rDir, tmpDir) { + t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir) + } + + // spot check results + servers := GetAvailableServers(rDir) + if len(servers) < 1 { + t.Fatalf("expected at least 1 server") + } + + // Refresh contents + rDir, err = extractRunners(payloadFS) + if err != nil { + t.Fatalf("failed to extract to %s %s", tmpDir, err) + } + if !strings.Contains(rDir, tmpDir) { + t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir) + } + + cleanupTmpDirs() + + Cleanup(payloadFS) +} diff --git a/scripts/build_darwin.sh b/scripts/build_darwin.sh index a2f76af2..17ac0b94 100755 --- a/scripts/build_darwin.sh +++ b/scripts/build_darwin.sh @@ -2,8 +2,7 @@ set -e -export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty 
--always | sed -e "s/^v//g")} -export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" +. $(dirname $0)/env.sh mkdir -p dist diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh index f70624e6..567eb7c7 100755 --- a/scripts/build_docker.sh +++ b/scripts/build_docker.sh @@ -2,76 +2,34 @@ set -eu -export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} -export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" - -# We use 2 different image repositories to handle combining architecture images into multiarch manifest -# (The ROCm image is x86 only and is not a multiarch manifest) -# For developers, you can override the DOCKER_ORG to generate multiarch manifests -# DOCKER_ORG=jdoe PUSH=1 ./scripts/build_docker.sh -DOCKER_ORG=${DOCKER_ORG:-"ollama"} -RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"} -FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"} - -BUILD_ARCH=${BUILD_ARCH:-"amd64"} +. $(dirname $0)/env.sh # Set PUSH to a non-empty string to trigger push instead of load PUSH=${PUSH:-""} -# In CI mode, we break things down -OLLAMA_SKIP_MANIFEST_CREATE=${OLLAMA_SKIP_MANIFEST_CREATE:-""} -OLLAMA_SKIP_IMAGE_BUILD=${OLLAMA_SKIP_IMAGE_BUILD:-""} - if [ -z "${PUSH}" ] ; then + echo "Building ${FINAL_IMAGE_REPO}:$VERSION locally. set PUSH=1 to push" LOAD_OR_PUSH="--load" else - echo "Will be pushing ${RELEASE_IMAGE_REPO}:$VERSION for ${BUILD_ARCH}" + echo "Will be pushing ${FINAL_IMAGE_REPO}:$VERSION" LOAD_OR_PUSH="--push" fi -if [ -z "${OLLAMA_SKIP_IMAGE_BUILD}" ]; then - for TARGETARCH in ${BUILD_ARCH}; do - docker build \ - ${LOAD_OR_PUSH} \ - --platform=linux/${TARGETARCH} \ - --build-arg=VERSION \ - --build-arg=GOFLAGS \ - -f Dockerfile \ - -t ${RELEASE_IMAGE_REPO}:$VERSION-${TARGETARCH} \ - . - done +docker buildx build \ + ${LOAD_OR_PUSH} \ + --platform=${PLATFORM} \ + ${OLLAMA_COMMON_BUILD_ARGS} \ + -f Dockerfile \ + -t ${FINAL_IMAGE_REPO}:$VERSION \ + . - if echo ${BUILD_ARCH} | grep "amd64" > /dev/null; then - docker build \ - ${LOAD_OR_PUSH} \ - --platform=linux/amd64 \ - --build-arg=VERSION \ - --build-arg=GOFLAGS \ - --target runtime-rocm \ - -f Dockerfile \ - -t ${RELEASE_IMAGE_REPO}:$VERSION-rocm \ - . - fi -fi - -if [ -z "${OLLAMA_SKIP_MANIFEST_CREATE}" ]; then - if [ -n "${PUSH}" ]; then - docker manifest create ${FINAL_IMAGE_REPO}:$VERSION \ - ${RELEASE_IMAGE_REPO}:$VERSION-amd64 \ - ${RELEASE_IMAGE_REPO}:$VERSION-arm64 - docker manifest push ${FINAL_IMAGE_REPO}:$VERSION - - # For symmetry, tag/push the rocm image - if [ "${RELEASE_IMAGE_REPO}" != "${FINAL_IMAGE_REPO}" ]; then - echo "Tagging and pushing rocm image" - docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm - docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:$VERSION-rocm - docker push ${FINAL_IMAGE_REPO}:$VERSION-rocm - fi - else - echo "Skipping manifest generation when not pushing images are available locally as " - echo " ${RELEASE_IMAGE_REPO}:$VERSION-amd64" - echo " ${RELEASE_IMAGE_REPO}:$VERSION-arm64" - echo " ${RELEASE_IMAGE_REPO}:$VERSION-rocm" - fi -fi +if echo $PLATFORM | grep "amd64" > /dev/null; then + docker buildx build \ + ${LOAD_OR_PUSH} \ + --platform=linux/amd64 \ + ${OLLAMA_COMMON_BUILD_ARGS} \ + --target runtime-rocm \ + -f Dockerfile \ + -t ${FINAL_IMAGE_REPO}:$VERSION-rocm \ + . 
+fi \ No newline at end of file diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 290b99c9..dc1d7991 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -1,38 +1,29 @@ #!/bin/sh +# +# Mac ARM users, rosetta can be flaky, so to use a remote x86 builder +# +# docker context create amd64 --docker host=ssh://mybuildhost +# docker buildx create --name mybuilder amd64 --platform linux/amd64 +# docker buildx create --name mybuilder --append desktop-linux --platform linux/arm64 +# docker buildx use mybuilder + set -eu -export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} -export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" -GZIP=$(which pigz 2>/dev/null || echo "gzip") +. $(dirname $0)/env.sh -#BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"} -BUILD_ARCH=${BUILD_ARCH:-"amd64"} -export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""} mkdir -p dist -for TARGETARCH in ${BUILD_ARCH}; do - docker build \ - --platform=linux/$TARGETARCH \ - --build-arg=GOFLAGS \ - --build-arg=CGO_CFLAGS \ - --build-arg=OLLAMA_CUSTOM_CPU_DEFS \ - --build-arg=AMDGPU_TARGETS \ - --target build-$TARGETARCH \ +docker buildx build \ + --output type=local,dest=./dist/ \ + --platform=${PLATFORM} \ + ${OLLAMA_COMMON_BUILD_ARGS} \ + --target dist \ -f Dockerfile \ - -t builder:$TARGETARCH \ . - docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH - rm -rf ./dist/linux-$TARGETARCH - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist - if echo ${TARGETARCH} | grep "amd64" > /dev/null; then - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist - fi - docker rm builder-$TARGETARCH - echo "Compressing final linux bundle..." - rm -f ./dist/ollama-linux-$TARGETARCH.tgz - (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz ) - if [ -d dist/linux-$TARGETARCH-rocm ]; then - (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz ) - fi -done + +# buildx behavior changes for single vs. 
multiplatform +if echo $PLATFORM | grep "," > /dev/null ; then + mv -f ./dist/linux_*64/ollama* ./dist/ + rmdir ./dist/linux_*64 +fi \ No newline at end of file diff --git a/scripts/env.sh b/scripts/env.sh new file mode 100644 index 00000000..d3ca05d7 --- /dev/null +++ b/scripts/env.sh @@ -0,0 +1,14 @@ +# Common environment setup across build*.sh scripts + +export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} +export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" +# TODO - consider `docker buildx ls --format=json` to autodiscover platform capability +PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"} +DOCKER_ORG=${DOCKER_ORG:-"ollama"} +RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"} +FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"} +OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION --build-arg=GOFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS --build-arg=AMDGPU_TARGETS" + +echo "Building Ollama" +echo "VERSION=$VERSION" +echo "PLATFORM=$PLATFORM" \ No newline at end of file diff --git a/server/routes.go b/server/routes.go index 5e9f51e1..6bd3a93f 100644 --- a/server/routes.go +++ b/server/routes.go @@ -26,11 +26,13 @@ import ( "golang.org/x/sync/errgroup" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/build" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/gpu" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/parser" + "github.com/ollama/ollama/runners" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/model" @@ -117,6 +119,32 @@ func (s *Server) GenerateHandler(c *gin.Context) { return } + // expire the runner + if req.Prompt == "" && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 { + model, err := GetModel(req.Model) + if err != nil { + switch { + case os.IsNotExist(err): + c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)}) + case err.Error() == "invalid model name": + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + default: + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + } + return + } + s.sched.expireRunner(model) + + c.JSON(http.StatusOK, api.GenerateResponse{ + Model: req.Model, + CreatedAt: time.Now().UTC(), + Response: "", + Done: true, + DoneReason: "unload", + }) + return + } + if req.Format != "" && req.Format != "json" { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be empty or \"json\""}) return @@ -1190,12 +1218,12 @@ func Serve(ln net.Listener) error { srvr.Close() schedDone() sched.unloadAllRunners() - gpu.Cleanup() + runners.Cleanup(build.EmbedFS) done() }() - if err := llm.Init(); err != nil { - return fmt.Errorf("unable to initialize llm library %w", err) + if _, err := runners.Refresh(build.EmbedFS); err != nil { + return fmt.Errorf("unable to initialize llm runners %w", err) } s.sched.Run(schedCtx) @@ -1322,6 +1350,32 @@ func (s *Server) ChatHandler(c *gin.Context) { return } + // expire the runner + if len(req.Messages) == 0 && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 { + model, err := GetModel(req.Model) + if err != nil { + switch { + case os.IsNotExist(err): + c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)}) + case err.Error() == "invalid model name": + c.JSON(http.StatusBadRequest, gin.H{"error": 
err.Error()}) + default: + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + } + return + } + s.sched.expireRunner(model) + + c.JSON(http.StatusOK, api.ChatResponse{ + Model: req.Model, + CreatedAt: time.Now().UTC(), + Message: api.Message{Role: "assistant"}, + Done: true, + DoneReason: "unload", + }) + return + } + caps := []Capability{CapabilityCompletion} if len(req.Tools) > 0 { caps = append(caps, CapabilityTools) diff --git a/server/sched.go b/server/sched.go index 58071bf0..3c8656ad 100644 --- a/server/sched.go +++ b/server/sched.go @@ -360,7 +360,6 @@ func (s *Scheduler) processCompleted(ctx context.Context) { slog.Debug("runner expired event received", "modelPath", runner.modelPath) runner.refMu.Lock() if runner.refCount > 0 { - // Shouldn't happen, but safeguard to ensure no leaked runners slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount) go func(runner *runnerRef) { // We can't unload yet, but want to as soon as the current request completes @@ -802,6 +801,25 @@ func (s *Scheduler) unloadAllRunners() { } } +func (s *Scheduler) expireRunner(model *Model) { + s.loadedMu.Lock() + defer s.loadedMu.Unlock() + runner, ok := s.loaded[model.ModelPath] + if ok { + runner.refMu.Lock() + runner.expiresAt = time.Now() + if runner.expireTimer != nil { + runner.expireTimer.Stop() + runner.expireTimer = nil + } + runner.sessionDuration = 0 + if runner.refCount <= 0 { + s.expiredCh <- runner + } + runner.refMu.Unlock() + } +} + // If other runners are loaded, make sure the pending request will fit in system memory // If not, pick a runner to unload, else return nil and the request can be loaded func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef { diff --git a/server/sched_test.go b/server/sched_test.go index fb049574..be32065a 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -406,6 +406,52 @@ func TestGetRunner(t *testing.T) { b.ctxDone() } +func TestExpireRunner(t *testing.T) { + ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond) + defer done() + s := InitScheduler(ctx) + req := &LlmRequest{ + ctx: ctx, + model: &Model{ModelPath: "foo"}, + opts: api.DefaultOptions(), + successCh: make(chan *runnerRef, 1), + errCh: make(chan error, 1), + sessionDuration: &api.Duration{Duration: 2 * time.Minute}, + } + + var ggml *llm.GGML + gpus := gpu.GpuInfoList{} + server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}} + s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + return server, nil + } + s.load(req, ggml, gpus, 0) + + select { + case err := <-req.errCh: + if err != nil { + t.Fatalf("expected no errors when loading, got '%s'", err.Error()) + } + case resp := <-req.successCh: + s.loadedMu.Lock() + if resp.refCount != uint(1) || len(s.loaded) != 1 { + t.Fatalf("expected a model to be loaded") + } + s.loadedMu.Unlock() + } + + s.expireRunner(&Model{ModelPath: "foo"}) + + s.finishedReqCh <- req + s.processCompleted(ctx) + + s.loadedMu.Lock() + if len(s.loaded) != 0 { + t.Fatalf("expected model to be unloaded") + } + s.loadedMu.Unlock() +} + // TODO - add one scenario that triggers the bogus finished event with positive ref count func TestPrematureExpired(t *testing.T) { ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
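TestExpireRunner above exercises the scheduler side of the new unload path; the matching API behavior added in GenerateHandler and ChatHandler is that an otherwise empty request with keep_alive set to 0 expires the loaded runner and returns done_reason "unload". A minimal sketch of driving it over HTTP, assuming a server on the default port 11434 and a loaded model; the model name llama3.1 is illustrative.

# Unload a model immediately rather than waiting for its idle timeout.
curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}'

# Equivalent for chat: no messages plus keep_alive 0.
curl http://localhost:11434/api/chat -d '{"model": "llama3.1", "messages": [], "keep_alive": 0}'

Both calls should answer with done true and done_reason "unload" without generating any tokens.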