Update README.md

Updated version tag version number, use default 0.0.0
Changed to amd64 only build and also added docker-compose.yml
2024-09-21 14:26:19 +01:00 · 2024-09-21 14:21:08 +01:00 · 2024-09-21 13:47:19 +01:00 · 2024-09-20 20:14:03 +01:00 · 2024-09-20 20:13:05 +01:00 · 2024-09-20 20:04:23 +01:00
17 changed files with 746 additions and 89 deletions
--- a/158
+++ b/158
@ -4,7 +4,7 @@ ARG CUDA_VERSION_11=11.3.1
 ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
 ARG CUDA_VERSION_12=12.4.0
 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
-ARG ROCM_VERSION=6.1.2
+ARG ROCM_VERSION=5.7.1

 # Copy the minimal context we need to run the generate scripts
 FROM scratch AS llm-code
@ -47,39 +47,39 @@ RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
    bash gen_linux.sh

-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64
-ARG CMAKE_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
-ARG CGO_CFLAGS
-ARG CUDA_V11_ARCHITECTURES
-ENV GOARCH=arm64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
-    OLLAMA_SKIP_CPU_GENERATE=1 \
-    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
-    CUDA_VARIANT="_v11" \
-    bash gen_linux.sh
+#FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64
+#ARG CMAKE_VERSION
+#COPY ./scripts/rh_linux_deps.sh /
+#RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+#ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+#COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+#WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+#ARG CGO_CFLAGS
+#ARG CUDA_V11_ARCHITECTURES
+#ENV GOARCH=arm64
+#RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
+#    OLLAMA_SKIP_CPU_GENERATE=1 \
+#    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
+#    CUDA_VARIANT="_v11" \
+#    bash gen_linux.sh

-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64
-ARG CMAKE_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
-ARG CGO_CFLAGS
-ARG CUDA_V12_ARCHITECTURES
-ENV GOARCH=arm64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 \
-    OLLAMA_SKIP_CPU_GENERATE=1 \
-    CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
-    CUDA_VARIANT="_v12" \
-    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
-    bash gen_linux.sh
+#FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64
+#ARG CMAKE_VERSION
+#COPY ./scripts/rh_linux_deps.sh /
+#RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+#ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+#COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+#WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+#ARG CGO_CFLAGS
+#ARG CUDA_V12_ARCHITECTURES
+#ENV GOARCH=arm64
+#RUN --mount=type=cache,target=/root/.ccache \
+#    OLLAMA_SKIP_STATIC_GENERATE=1 \
+#    OLLAMA_SKIP_CPU_GENERATE=1 \
+#    CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \
+#    CUDA_VARIANT="_v12" \
+#    OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
+#    bash gen_linux.sh


 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
@ -123,24 +123,24 @@ FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
 RUN --mount=type=cache,target=/root/.ccache \
    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh

-FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-ARG OLLAMA_CUSTOM_CPU_DEFS
-ARG CGO_CFLAGS
-ENV GOARCH=arm64
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+#FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64
+#ARG CMAKE_VERSION
+#ARG GOLANG_VERSION
+#COPY ./scripts/rh_linux_deps.sh /
+#RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+#ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH
+#COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+#ARG OLLAMA_CUSTOM_CPU_DEFS
+#ARG CGO_CFLAGS
+#ENV GOARCH=arm64
+#WORKDIR /go/src/github.com/ollama/ollama/llm/generate

-FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_CPU_TARGET="static" bash gen_linux.sh
-FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
-RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
+#FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
+#RUN --mount=type=cache,target=/root/.ccache \
+#    OLLAMA_CPU_TARGET="static" bash gen_linux.sh
+#FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
+#RUN --mount=type=cache,target=/root/.ccache \
+#    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh


 # Intermediate stages used for ./scripts/build_linux.sh
@ -166,28 +166,28 @@ RUN cd dist/linux-$GOARCH && \
 RUN cd dist/linux-$GOARCH-rocm && \
    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz

-FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
-ENV CGO_ENABLED=1
-ARG GOLANG_VERSION
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
-COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
-COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
-COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
-RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+#FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
+#ENV CGO_ENABLED=1
+#ARG GOLANG_VERSION
+#WORKDIR /go/src/github.com/ollama/ollama
+#COPY . .
+#COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
+#COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+#COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
+#COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+#COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
+#ARG GOFLAGS
+#ARG CGO_CFLAGS
+#RUN --mount=type=cache,target=/root/.ccache \
+#    go build -trimpath -o dist/linux-arm64/bin/ollama .
+#RUN cd dist/linux-$GOARCH && \
+#    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz

-FROM --platform=linux/amd64 scratch AS dist-amd64
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM --platform=linux/arm64 scratch AS dist-arm64
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
-FROM dist-$TARGETARCH as dist
+#FROM --platform=linux/amd64 scratch AS dist-amd64
+#COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
+#FROM --platform=linux/arm64 scratch AS dist-arm64
+#COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
+#FROM dist-$TARGETARCH as dist


 # Optimized container images do not carry nested payloads
@ -218,14 +218,14 @@ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd
 COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/

-FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
-RUN apt-get update && \
-    apt-get install -y ca-certificates && \
-    apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+#FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
+#RUN apt-get update && \
+#    apt-get install -y ca-certificates && \
+#    apt-get clean && rm -rf /var/lib/apt/lists/*
+#COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+#COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+#COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+#COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/

 # ROCm libraries larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
--- a/README.md
+++ b/README.md
@ -8,6 +8,195 @@

 Get up and running with large language models.

+### Linux with rx580 Radeon GPU
+
+This branch has been changed to build for the amd64 architecture only (the arm64 stages have been commented out in the Dockerfile, purely to speed up the build) so that Ollama works with the
+RX 580 Radeon GPU.
+
+It should be considered experimental.
+
+I've only tested the Docker build, on Ubuntu 22.04.4 LTS.
+
+Make sure docker is installed and running ok, and the docker host machine has rocm 5.7.1 libraries installed.
+
+Follow this documentation to install ROCm, substituting 5.7.1 wherever the documentation refers to 5.7.0:
+https://rocm.docs.amd.com/en/docs-5.7.0/deploy/linux/os-native/install.html
+
+To build
+
+```
+export VERSION=0.0.0
+
+./scripts/build_docker.sh
+
+```
+After the build has completed successfully, start a container from the image:
+```
+
+docker run -e HIP_PATH=/opt/rocm/lib/ -e LD_LIBRARY_PATH=/opt/rocm/lib --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama_gpu ollama/release:0.3.10-rc1-2-g56318fb-dirty-rocm
+
+```  
+Make sure to change the tag "0.3.10-rc1-2-g56318fb-dirty-rocm" to the one produced by your own build. It is shown in the last phase of the build, where the images are exported.
+
+The debug info that gets output should look something like:
+```
+docker run -e HIP_PATH=/opt/rocm/lib/ -e LD_LIBRARY_PATH=/opt/rocm/lib --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 11434:11434 --name ollama_gpu_2 ollama/release:3449201-rocm                    
+2024/09/15 14:56:41 routes.go:1125: INFO server config env="map[CUDA_VISIBLE_DEVICES: GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://0.0.0.0:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/root/.ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_RUNNERS_DIR: OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES:]"
+time=2024-09-15T14:56:41.304Z level=INFO source=images.go:753 msg="total blobs: 18"
+time=2024-09-15T14:56:41.307Z level=INFO source=images.go:760 msg="total unused blobs removed: 0"
+time=2024-09-15T14:56:41.307Z level=INFO source=routes.go:1172 msg="Listening on [::]:11434 (version 3449201)"
+time=2024-09-15T14:56:41.308Z level=INFO source=payload.go:30 msg="extracting embedded files" dir=/tmp/ollama2706594826/runners
+time=2024-09-15T14:56:51.283Z level=INFO source=payload.go:44 msg="Dynamic LLM libraries [cpu_avx cpu_avx2 cuda_v11 cuda_v12 rocm_v0 cpu]"
+time=2024-09-15T14:56:51.283Z level=INFO source=gpu.go:200 msg="looking for compatible GPUs"
+time=2024-09-15T14:56:51.296Z level=WARN source=amd_linux.go:59 msg="ollama recommends running the https://www.amd.com/en/support/linux-drivers" error="amdgpu version file missing: /sys/module/amdgpu/version stat /sys/module/amdgpu/version: no such file or directory"
+time=2024-09-15T14:56:51.308Z level=INFO source=amd_linux.go:345 msg="amdgpu is supported" gpu=0 gpu_type=gfx803
+time=2024-09-15T14:56:51.308Z level=INFO source=types.go:107 msg="inference compute" id=0 library=rocm variant="" compute=gfx803 driver=0.0 name=1002:67df total="8.0 GiB" available="8.0 GiB"
+[GIN] 2024/09/15 - 14:57:20 | 200 |       46.11µs |       127.0.0.1 | HEAD     "/"
+[GIN] 2024/09/15 - 14:57:20 | 200 |   24.189203ms |       127.0.0.1 | POST     "/api/show"
+
+```
+
+
+Once running, in another terminal window, test it out:
+
+```
+docker exec -it ollama_gpu ollama run llama3.1
+
+```
+
+Check the debug log again; it should look something like:
+
+```
+time=2024-09-15T14:57:20.500Z level=INFO source=sched.go:715 msg="new model will fit in available VRAM in single GPU, loading" model=/root/.ollama/models/blobs/sha256-8eeb52dfb3bb9aefdf9d1ef24b3bdbcfbe82238798c4b918278320b6fcef18fe gpu=0 parallel=4 available=8584495104 required="6.2 GiB"
+time=2024-09-15T14:57:20.500Z level=INFO source=server.go:101 msg="system memory" total="15.6 GiB" free="14.6 GiB" free_swap="46.5 GiB"
+time=2024-09-15T14:57:20.500Z level=INFO source=memory.go:326 msg="offload to rocm" layers.requested=-1 layers.model=33 layers.offload=33 layers.split="" memory.available="[8.0 GiB]" memory.gpu_overhead="0 B" memory.required.full="6.2 GiB" memory.required.partial="6.2 GiB" memory.required.kv="1.0 GiB" memory.required.allocations="[6.2 GiB]" memory.weights.total="4.7 GiB" memory.weights.repeating="4.3 GiB" memory.weights.nonrepeating="411.0 MiB" memory.graph.full="560.0 MiB" memory.graph.partial="677.5 MiB"
+time=2024-09-15T14:57:20.503Z level=INFO source=server.go:391 msg="starting llama server" cmd="/tmp/ollama2706594826/runners/rocm_v0/ollama_llama_server --model /root/.ollama/models/blobs/sha256-8eeb52dfb3bb9aefdf9d1ef24b3bdbcfbe82238798c4b918278320b6fcef18fe --ctx-size 8192 --batch-size 512 --embedding --log-disable --n-gpu-layers 33 --parallel 4 --port 43843"
+time=2024-09-15T14:57:20.503Z level=INFO source=sched.go:450 msg="loaded runners" count=1
+time=2024-09-15T14:57:20.503Z level=INFO source=server.go:590 msg="waiting for llama runner to start responding"
+time=2024-09-15T14:57:20.503Z level=INFO source=server.go:624 msg="waiting for server to become available" status="llm server error"
+INFO [main] build info | build=3661 commit="8962422b" tid="126494289312832" timestamp=1726412240
+INFO [main] system info | n_threads=4 n_threads_batch=4 system_info="AVX = 1 | AVX_VNNI = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 0 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | " tid="126494289312832" timestamp=1726412240 total_threads=8
+INFO [main] HTTP server listening | hostname="127.0.0.1" n_threads_http="7" port="43843" tid="126494289312832" timestamp=1726412240
+llama_model_loader: loaded meta data with 29 key-value pairs and 292 tensors from /root/.ollama/models/blobs/sha256-8eeb52dfb3bb9aefdf9d1ef24b3bdbcfbe82238798c4b918278320b6fcef18fe (version GGUF V3 (latest))
+llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+llama_model_loader: - kv   0:                       general.architecture str              = llama
+llama_model_loader: - kv   1:                               general.type str              = model
+llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
+llama_model_loader: - kv   3:                           general.finetune str              = Instruct
+llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
+llama_model_loader: - kv   5:                         general.size_label str              = 8B
+llama_model_loader: - kv   6:                            general.license str              = llama3.1
+llama_model_loader: - kv   7:                               general.tags arr[str,6]       = ["facebook", "meta", "pytorch", "llam...
+llama_model_loader: - kv   8:                          general.languages arr[str,8]       = ["en", "de", "fr", "it", "pt", "hi", ...
+llama_model_loader: - kv   9:                          llama.block_count u32              = 32
+llama_model_loader: - kv  10:                       llama.context_length u32              = 131072
+llama_model_loader: - kv  11:                     llama.embedding_length u32              = 4096
+llama_model_loader: - kv  12:                  llama.feed_forward_length u32              = 14336
+llama_model_loader: - kv  13:                 llama.attention.head_count u32              = 32
+llama_model_loader: - kv  14:              llama.attention.head_count_kv u32              = 8
+llama_model_loader: - kv  15:                       llama.rope.freq_base f32              = 500000.000000
+llama_model_loader: - kv  16:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010
+llama_model_loader: - kv  17:                          general.file_type u32              = 2
+llama_model_loader: - kv  18:                           llama.vocab_size u32              = 128256
+llama_model_loader: - kv  19:                 llama.rope.dimension_count u32              = 128
+llama_model_loader: - kv  20:                       tokenizer.ggml.model str              = gpt2
+llama_model_loader: - kv  21:                         tokenizer.ggml.pre str              = llama-bpe
+llama_model_loader: - kv  22:                      tokenizer.ggml.tokens arr[str,128256]  = ["!", "\"", "#", "$", "%", "&", "'", ...
+llama_model_loader: - kv  23:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+time=2024-09-15T14:57:21.006Z level=INFO source=server.go:624 msg="waiting for server to become available" status="llm server loading model"
+llama_model_loader: - kv  24:                      tokenizer.ggml.merges arr[str,280147]  = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
+llama_model_loader: - kv  25:                tokenizer.ggml.bos_token_id u32              = 128000
+llama_model_loader: - kv  26:                tokenizer.ggml.eos_token_id u32              = 128009
+llama_model_loader: - kv  27:                    tokenizer.chat_template str              = {{- bos_token }}\n{%- if custom_tools ...
+llama_model_loader: - kv  28:               general.quantization_version u32              = 2
+llama_model_loader: - type  f32:   66 tensors
+llama_model_loader: - type q4_0:  225 tensors
+llama_model_loader: - type q6_K:    1 tensors
+llm_load_vocab: special tokens cache size = 256
+llm_load_vocab: token to piece cache size = 0.7999 MB
+llm_load_print_meta: format           = GGUF V3 (latest)
+llm_load_print_meta: arch             = llama
+llm_load_print_meta: vocab type       = BPE
+llm_load_print_meta: n_vocab          = 128256
+llm_load_print_meta: n_merges         = 280147
+llm_load_print_meta: vocab_only       = 0
+llm_load_print_meta: n_ctx_train      = 131072
+llm_load_print_meta: n_embd           = 4096
+llm_load_print_meta: n_layer          = 32
+llm_load_print_meta: n_head           = 32
+llm_load_print_meta: n_head_kv        = 8
+llm_load_print_meta: n_rot            = 128
+llm_load_print_meta: n_swa            = 0
+llm_load_print_meta: n_embd_head_k    = 128
+llm_load_print_meta: n_embd_head_v    = 128
+llm_load_print_meta: n_gqa            = 4
+llm_load_print_meta: n_embd_k_gqa     = 1024
+llm_load_print_meta: n_embd_v_gqa     = 1024
+llm_load_print_meta: f_norm_eps       = 0.0e+00
+llm_load_print_meta: f_norm_rms_eps   = 1.0e-05
+llm_load_print_meta: f_clamp_kqv      = 0.0e+00
+llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+llm_load_print_meta: f_logit_scale    = 0.0e+00
+llm_load_print_meta: n_ff             = 14336
+llm_load_print_meta: n_expert         = 0
+llm_load_print_meta: n_expert_used    = 0
+llm_load_print_meta: causal attn      = 1
+llm_load_print_meta: pooling type     = 0
+llm_load_print_meta: rope type        = 0
+llm_load_print_meta: rope scaling     = linear
+llm_load_print_meta: freq_base_train  = 500000.0
+llm_load_print_meta: freq_scale_train = 1
+llm_load_print_meta: n_ctx_orig_yarn  = 131072
+llm_load_print_meta: rope_finetuned   = unknown
+llm_load_print_meta: ssm_d_conv       = 0
+llm_load_print_meta: ssm_d_inner      = 0
+llm_load_print_meta: ssm_d_state      = 0
+llm_load_print_meta: ssm_dt_rank      = 0
+llm_load_print_meta: ssm_dt_b_c_rms   = 0
+llm_load_print_meta: model type       = 8B
+llm_load_print_meta: model ftype      = Q4_0
+llm_load_print_meta: model params     = 8.03 B
+llm_load_print_meta: model size       = 4.33 GiB (4.64 BPW) 
+llm_load_print_meta: general.name     = Meta Llama 3.1 8B Instruct
+llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'
+llm_load_print_meta: EOS token        = 128009 '<|eot_id|>'
+llm_load_print_meta: LF token         = 128 'Ä'
+llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'
+llm_load_print_meta: max token length = 256
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 ROCm devices:
+  Device 0: Radeon RX 580 Series, compute capability 8.0, VMM: no
+llm_load_tensors: ggml ctx size =    0.27 MiB
+llm_load_tensors: offloading 32 repeating layers to GPU
+llm_load_tensors: offloading non-repeating layers to GPU
+llm_load_tensors: offloaded 33/33 layers to GPU
+llm_load_tensors:      ROCm0 buffer size =  4156.00 MiB
+llm_load_tensors:        CPU buffer size =   281.81 MiB
+llama_new_context_with_model: n_ctx      = 8192
+llama_new_context_with_model: n_batch    = 512
+llama_new_context_with_model: n_ubatch   = 512
+llama_new_context_with_model: flash_attn = 0
+llama_new_context_with_model: freq_base  = 500000.0
+llama_new_context_with_model: freq_scale = 1
+llama_kv_cache_init:      ROCm0 KV buffer size =  1024.00 MiB
+llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB
+llama_new_context_with_model:  ROCm_Host  output buffer size =     2.02 MiB
+llama_new_context_with_model:      ROCm0 compute buffer size =   560.00 MiB
+llama_new_context_with_model:  ROCm_Host compute buffer size =    24.01 MiB
+llama_new_context_with_model: graph nodes  = 1030
+llama_new_context_with_model: graph splits = 2
+INFO [main] model loaded | tid="126494289312832" timestamp=1726412253
+time=2024-09-15T14:57:33.297Z level=INFO source=server.go:629 msg="llama runner started in 12.79 seconds"
+[GIN] 2024/09/15 - 14:57:33 | 200 | 12.853561919s |       127.0.0.1 | POST     "/api/chat"
+[GIN] 2024/09/15 - 14:57:43 | 200 |  1.091025241s |       127.0.0.1 | POST     "/api/chat"
+
+``` 
+
+Good luck!
+
 ### macOS

 [Download](https://ollama.com/download/Ollama-darwin.zip)
@ -197,6 +386,18 @@ ollama show llama3.1
 ollama list
 ```

+### List which models are currently loaded
+
+```
+ollama ps
+```
+
+### Stop a model which is currently running
+
+```
+ollama stop llama3.1
+```
+
 ### Start Ollama

 `ollama serve` is used when you want to start ollama without running the desktop application.
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -0,0 +1,20 @@
+version: '3.8'
+
+services:
+  ollama_gpu:
+    image: ollama/ollama:0.0.0-rocm
+    container_name: ollama_gpu_3
+    environment:
+      - HIP_PATH=/opt/rocm/lib/
+      - LD_LIBRARY_PATH=/opt/rocm/lib
+    devices:
+      - /dev/kfd
+      - /dev/dri
+    volumes:
+      - ollama:/root/.ollama
+    ports:
+      - "11434:11434"
+    restart: unless-stopped
+
+volumes:
+  ollama:
--- a/docs/api.md
+++ b/docs/api.md
@ -407,6 +407,33 @@ A single JSON object is returned:
 }
 ```

+#### Unload a model
+
+If an empty prompt is provided and the `keep_alive` parameter is set to `0`, a model will be unloaded from memory.
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "llama3.1",
+  "keep_alive": 0
+}'
+```
+
+##### Response
+
+A single JSON object is returned:
+
+```json
+{
+  "model": "llama3.1",
+  "created_at": "2024-09-12T03:54:03.516566Z",
+  "response": "",
+  "done": true,
+  "done_reason": "unload"
+}
+```
+
 ## Generate a chat completion

 ```shell
@ -736,6 +763,64 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```

+#### Load a model
+
+If the messages array is empty, the model will be loaded into memory.
+
+##### Request
+
+```
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.1",
+  "messages": []
+}'
+```
+
+##### Response
+```json
+{
+  "model": "llama3.1",
+  "created_at":"2024-09-12T21:17:29.110811Z",
+  "message": {
+    "role": "assistant",
+    "content": ""
+  },
+  "done_reason": "load",
+  "done": true
+}
+```
+
+#### Unload a model
+
+If the messages array is empty and the `keep_alive` parameter is set to `0`, a model will be unloaded from memory.
+
+##### Request
+
+```
+curl http://localhost:11434/api/chat -d '{
+  "model": "llama3.1",
+  "messages": [],
+  "keep_alive": 0
+}'
+```
+
+##### Response
+
+A single JSON object is returned:
+
+```json
+{
+  "model": "llama3.1",
+  "created_at":"2024-09-12T21:33:17.547535Z",
+  "message": {
+    "role": "assistant",
+    "content": ""
+  },
+  "done_reason": "unload",
+  "done": true
+}
+```
+
 ## Create a Model

 ```shell
--- a/docs/faq.md
+++ b/docs/faq.md
@ -237,9 +237,13 @@ ollama run llama3.1 ""

 ## How do I keep a model loaded in memory or make it unload immediately?

-By default models are kept in memory for 5 minutes before being unloaded. This allows for quicker response times if you are making numerous requests to the LLM. You may, however, want to free up the memory before the 5 minutes have elapsed or keep the model loaded indefinitely. Use the `keep_alive` parameter with either the `/api/generate` and `/api/chat` API endpoints to control how long the model is left in memory.
+By default models are kept in memory for 5 minutes before being unloaded. This allows for quicker response times if you're making numerous requests to the LLM. If you want to immediately unload a model from memory, use the `ollama stop` command:

-The `keep_alive` parameter can be set to:
+```shell
+ollama stop llama3.1
+```
+
+If you're using the API, use the `keep_alive` parameter with the `/api/generate` and `/api/chat` endpoints to set the amount of time that a model stays in memory. The `keep_alive` parameter can be set to:
 * a duration string (such as "10m" or "24h")
 * a number in seconds (such as 3600)
 * any negative number which will keep the model loaded in memory (e.g. -1 or "-1m")
@ -255,9 +259,9 @@ To unload the model and free up memory use:
 curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}'
 ```

-Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
+Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to the section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.

-If you wish to override the `OLLAMA_KEEP_ALIVE` setting, use the `keep_alive` API parameter with the `/api/generate` or `/api/chat` API endpoints.
+The `keep_alive` API parameter with the `/api/generate` and `/api/chat` API endpoints will override the `OLLAMA_KEEP_ALIVE` setting.

 ## How do I manage the maximum number of requests the Ollama server can queue?

--- a/examples/python-grounded-factuality-rag-check/README.md
+++ b/examples/python-grounded-factuality-rag-check/README.md
@ -0,0 +1,93 @@
+# RAG Hallucination Checker using Bespoke-Minicheck
+
+This example allows the user to ask questions related to a document, which can be specified via an article URL. Relevant chunks are retrieved from the document and given to `llama3.1` as context to answer the question. Then each sentence in the answer is checked against the retrieved chunks using `bespoke-minicheck` to ensure that the answer does not contain hallucinations.
+
+## Running the Example
+
+1. Ensure the `all-minilm` (embedding), `llama3.1` (chat), and `bespoke-minicheck` (check) models are installed:
+
+   ```bash
+   ollama pull all-minilm
+   ollama pull llama3.1
+   ollama pull bespoke-minicheck
+   ```
+
+2. Install the dependencies.
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the example:
+
+   ```bash
+   python main.py
+   ```
+
+## Expected Output
+
+```text
+Enter the URL of an article you want to chat with, or press Enter for default example:
+
+Loaded, chunked, and embedded text from https://www.theverge.com/2024/9/12/24242439/openai-o1-model-reasoning-strawberry-chatgpt.
+
+Enter your question or type quit: Who is the CEO of openai?
+
+Retrieved chunks:
+OpenAI is releasing a new model called o1 , the first in a planned series of “ reasoning ” models that have been trained to answer more complex questions , faster than a human can . It ’ s being released alongside o1-mini , a smaller , cheaper version . And yes , if you ’ re steeped in AI rumors : this is , in fact , the extremely hyped Strawberry model . For OpenAI , o1 represents a step toward its broader goal of human-like artificial intelligence .
+
+OpenAI is releasing a new model called o1 , the first in a planned series of “ reasoning ” models that have been trained to answer more complex questions , faster than a human can . It ’ s being released alongside o1-mini , a smaller , cheaper version . And yes , if you ’ re steeped in AI rumors : this is , in fact , the extremely hyped Strawberry model . For OpenAI , o1 represents a step toward its broader goal of human-like artificial intelligence . More practically , it does a better job at writing code and solving multistep problems than previous models . But it ’ s also more expensive and slower to use than GPT-4o . OpenAI is calling this release of o1 a “ preview ” to emphasize how nascent it is . ChatGPT Plus and Team users get access to both o1-preview and o1-mini starting today , while Enterprise and Edu users will get access early next week .
+
+More practically , it does a better job at writing code and solving multistep problems than previous models . But it ’ s also more expensive and slower to use than GPT-4o . OpenAI is calling this release of o1 a “ preview ” to emphasize how nascent it is . ChatGPT Plus and Team users get access to both o1-preview and o1-mini starting today , while Enterprise and Edu users will get access early next week . OpenAI says it plans to bring o1-mini access to all the free users of ChatGPT but hasn ’ t set a release date yet . Developer access to o1 is really expensive : In the API , o1-preview is $ 15 per 1 million input tokens , or chunks of text parsed by the model , and $ 60 per 1 million output tokens . For comparison , GPT-4o costs $ 5 per 1 million input tokens and $ 15 per 1 million output tokens .
+
+OpenAI says it plans to bring o1-mini access to all the free users of ChatGPT but hasn ’ t set a release date yet . Developer access to o1 is really expensive : In the API , o1-preview is $ 15 per 1 million input tokens , or chunks of text parsed by the model , and $ 60 per 1 million output tokens . For comparison , GPT-4o costs $ 5 per 1 million input tokens and $ 15 per 1 million output tokens . The training behind o1 is fundamentally different from its predecessors , OpenAI ’ s research lead , Jerry Tworek , tells me , though the company is being vague about the exact details . He says o1 “ has been trained using a completely new optimization algorithm and a new training dataset specifically tailored for it. ” Image : OpenAI OpenAI taught previous GPT models to mimic patterns from its training data .
+
+LLM Answer:
+The text does not mention the CEO of OpenAI. It only discusses the release of a new model called o1 and some details about it, but does not provide information on the company's leadership.
+
+LLM Claim: The text does not mention the CEO of OpenAI.
+Is this claim supported by the context according to bespoke-minicheck? Yes
+
+LLM Claim: It only discusses the release of a new model called o1 and some details about it, but does not provide information on the company's leadership.
+Is this claim supported by the context according to bespoke-minicheck? No
+```
+
+The second claim is unsupported because the retrieved text does mention a member of the company's leadership: its research lead, Jerry Tworek.
+
+Another tricky example:
+
+```text
+
+Enter your question or type quit: what sets o1 apart from gpt-4o?
+
+Retrieved chunks: 
+OpenAI says it plans to bring o1-mini access to all the free users of ChatGPT but hasn ’ t set a release date yet . Developer access to o1 is really expensive : In the API , o1-preview is $ 15 per 1 million input tokens , or chunks of text parsed by the model , and $ 60 per 1 million output tokens . For comparison , GPT-4o costs $ 5 per 1 million input tokens and $ 15 per 1 million output tokens . The training behind o1 is fundamentally different from its predecessors , OpenAI ’ s research lead , Jerry Tworek , tells me , though the company is being vague about the exact details . He says o1 “ has been trained using a completely new optimization algorithm and a new training dataset specifically tailored for it. ” Image : OpenAI OpenAI taught previous GPT models to mimic patterns from its training data .
+
+He says OpenAI also tested o1 against a qualifying exam for the International Mathematics Olympiad , and while GPT-4o only correctly solved only 13 percent of problems , o1 scored 83 percent . “ We can ’ t say we solved hallucinations ” In online programming contests known as Codeforces competitions , this new model reached the 89th percentile of participants , and OpenAI claims the next update of this model will perform “ similarly to PhD students on challenging benchmark tasks in physics , chemistry and biology. ” At the same time , o1 is not as capable as GPT-4o in a lot of areas . It doesn ’ t do as well on factual knowledge about the world .
+
+More practically , it does a better job at writing code and solving multistep problems than previous models . But it ’ s also more expensive and slower to use than GPT-4o . OpenAI is calling this release of o1 a “ preview ” to emphasize how nascent it is . ChatGPT Plus and Team users get access to both o1-preview and o1-mini starting today , while Enterprise and Edu users will get access early next week . OpenAI says it plans to bring o1-mini access to all the free users of ChatGPT but hasn ’ t set a release date yet . Developer access to o1 is really expensive : In the API , o1-preview is $ 15 per 1 million input tokens , or chunks of text parsed by the model , and $ 60 per 1 million output tokens . For comparison , GPT-4o costs $ 5 per 1 million input tokens and $ 15 per 1 million output tokens .
+
+“ We can ’ t say we solved hallucinations ” In online programming contests known as Codeforces competitions , this new model reached the 89th percentile of participants , and OpenAI claims the next update of this model will perform “ similarly to PhD students on challenging benchmark tasks in physics , chemistry and biology. ” At the same time , o1 is not as capable as GPT-4o in a lot of areas . It doesn ’ t do as well on factual knowledge about the world . It also doesn ’ t have the ability to browse the web or process files and images . Still , the company believes it represents a brand-new class of capabilities . It was named o1 to indicate “ resetting the counter back to 1. ” “ I ’ m gon na be honest : I think we ’ re terrible at naming , traditionally , ” McGrew says .
+LLM Answer: According to the text, several things set o1 apart from GPT-4o:
+
+* In online programming contests (Codeforces competitions), o1 scored 83% correct solutions compared to GPT-4o's 13%.
+* The training behind o1 is "fundamentally different" from its predecessors, including a completely new optimization algorithm and a new training dataset specifically tailored for it.
+* o1 has been shown to perform similarly to PhD students on challenging benchmark tasks in physics, chemistry, and biology, while GPT-4o does not have this level of performance.
+* However, o1 is also less capable than GPT-4o in some areas, particularly with regard to factual knowledge about the world.
+
+LLM Claim: According to the text, several things set o1 apart from GPT-4o:
+
+* In online programming contests (Codeforces competitions), o1 scored 83% correct solutions compared to GPT-4o's 13%.
+Is this claim supported by the context according to bespoke-minicheck? Yes
+
+LLM Claim: * The training behind o1 is "fundamentally different" from its predecessors, including a completely new optimization algorithm and a new training dataset specifically tailored for it.
+Is this claim supported by the context according to bespoke-minicheck? Yes
+
+LLM Claim: * o1 has been shown to perform similarly to PhD students on challenging benchmark tasks in physics, chemistry, and biology, while GPT-4o does not have this level of performance.
+Is this claim supported by the context according to bespoke-minicheck? No
+
+LLM Claim: * However, o1 is also less capable than GPT-4o in some areas, particularly with regard to factual knowledge about the world.
+Is this claim supported by the context according to bespoke-minicheck? Yes
+```
+
+We see that the third claim "* o1 has been shown to perform similarly to PhD students on challenging benchmark tasks in physics, chemistry, and biology, while GPT-4o does not have this level of performance." is not supported by the context. This is because the context only reports that OpenAI *claims* the next update of the model will perform at that level, which is different from o1 having "been shown to perform" that way.
--- a/examples/python-grounded-factuality-rag-check/main.py
+++ b/examples/python-grounded-factuality-rag-check/main.py
@ -0,0 +1,137 @@
+import ollama
+import warnings
+from mattsollamatools import chunker
+from newspaper import Article
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+import nltk
+
+warnings.filterwarnings(
+    "ignore", category=FutureWarning, module="transformers.tokenization_utils_base"
+)
+nltk.download("punkt", quiet=True)
+
+
+def getArticleText(url):
+    """Download the page at `url` and return only its article text.
+
+    News pages tend to surround the story with ads and menus, so
+    newspaper3k's Article is used to pull out just the article body.
+    """
+    page = Article(url)
+    # The page must be downloaded and parsed before its text is read.
+    page.download()
+    page.parse()
+    body_text = page.text
+    return body_text
+
+
+def knn_search(question_embedding, embeddings, k=5):
+    """Performs K-nearest neighbors (KNN) search over all stored chunk embeddings.
+
+    Args:
+        question_embedding: query vector(s), passed straight to
+            NearestNeighbors.kneighbors; only the neighbours of the first
+            query row are returned.
+        embeddings: list of article dicts, each holding an "embeddings" list
+            whose items have "embedding" (vector) and "source" (chunk text) keys.
+        k: maximum number of neighbours to return.
+
+    Returns:
+        A list of (index, source_text) tuples. It holds fewer than k entries
+        when fewer than k chunks are stored, and is empty when none are stored.
+    """
+    # Flatten every article's items once so vectors and texts stay index-aligned.
+    items = [item for article in embeddings for item in article["embeddings"]]
+    source_texts = [item["source"] for item in items]
+
+    # Asking kneighbors() for more neighbours than fitted samples raises
+    # ValueError, which happens when a short article yields only a few chunks.
+    k = min(k, len(items))
+    if k <= 0:
+        return []
+
+    X = np.array([item["embedding"] for item in items])
+
+    # Fit a KNN model on the embeddings
+    knn = NearestNeighbors(n_neighbors=k, metric="cosine")
+    knn.fit(X)
+
+    # Only the neighbour indices are needed; the distances are discarded.
+    _, indices = knn.kneighbors(question_embedding, n_neighbors=k)
+
+    # Pair each matched index with the chunk text it refers to.
+    return [(idx, source_texts[idx]) for idx in indices[0]]
+
+
+def check(document, claim):
+    """Ask bespoke-minicheck whether `claim` is supported by `document`.
+
+    The stripped model reply is returned: Yes/yes if the claim is supported
+    by the document, No/no otherwise. Support for logits will be added in
+    the future.
+
+    bespoke-minicheck's system prompt is defined as:
+      'Determine whether the provided claim is consistent with the corresponding
+      document. Consistency in this context implies that all information presented in the claim
+      is substantiated by the document. If not, it should be considered inconsistent. Please
+      assess the claim's consistency with the document by responding with either "Yes" or "No".'
+
+    bespoke-minicheck's user prompt is defined as:
+      "Document: {document}\nClaim: {claim}"
+    """
+    generation_options = {"num_predict": 2, "temperature": 0.0}
+    reply = ollama.generate(
+        model="bespoke-minicheck",
+        prompt=f"Document: {document}\nClaim: {claim}",
+        options=generation_options,
+    )
+    verdict = reply["response"]
+    return verdict.strip()
+
+
+if __name__ == "__main__":
+    # Interactive demo: embed the chunks of one article, then repeatedly answer
+    # questions with llama3.1 and check every answer sentence against the
+    # retrieved chunks with bespoke-minicheck.
+    allEmbeddings = []
+    default_url = "https://www.theverge.com/2024/9/12/24242439/openai-o1-model-reasoning-strawberry-chatgpt"
+    user_input = input(
+        "Enter the URL of an article you want to chat with, or press Enter for default example: "
+    )
+    article_url = user_input.strip() if user_input.strip() else default_url
+    article = {}
+    article["embeddings"] = []
+    article["url"] = article_url
+    text = getArticleText(article_url)
+    chunks = chunker(text)
+
+    # Embed (batch) chunks using ollama
+    embeddings = ollama.embed(model="all-minilm", input=chunks)["embeddings"]
+
+    # Keep each chunk's text next to its vector so search hits map back to text.
+    for chunk, embedding in zip(chunks, embeddings):
+        item = {}
+        item["source"] = chunk
+        item["embedding"] = embedding
+        item["sourcelength"] = len(chunk)
+        article["embeddings"].append(item)
+
+    allEmbeddings.append(article)
+
+    print(f"\nLoaded, chunked, and embedded text from {article_url}.\n")
+
+    while True:
+        # Input a question from the user
+        # For example, "Who is the chief research officer?"
+        # Surrounding whitespace is dropped so that " quit " still exits.
+        question = input("Enter your question or type quit: ").strip()
+
+        if question.lower() == "quit":
+            break
+
+        # A blank line has nothing to embed or answer; ask again.
+        if not question:
+            continue
+
+        # Embed the user's question using ollama.embed
+        question_embedding = ollama.embed(model="all-minilm", input=question)[
+            "embeddings"
+        ]
+
+        # Perform KNN search to find the best matches (indices and source text)
+        best_matches = knn_search(question_embedding, allEmbeddings, k=4)
+
+        sourcetext = "\n\n".join([source_text for (_, source_text) in best_matches])
+
+        print(f"\nRetrieved chunks: \n{sourcetext}\n")
+
+        # Give the retrieved chunks and question to the chat model
+        system_prompt = f"Only use the following information to answer the question. Do not use anything else: {sourcetext}"
+
+        ollama_response = ollama.generate(
+            model="llama3.1",
+            prompt=question,
+            system=system_prompt,
+            # `stream` is a request-level argument, not a model option, so it is
+            # passed directly instead of inside `options`.
+            stream=False,
+        )
+
+        answer = ollama_response["response"]
+        print(f"LLM Answer:\n{answer}\n")
+
+        # Check each sentence in the response for grounded factuality
+        if answer:
+            for claim in nltk.sent_tokenize(answer):
+                print(f"LLM Claim: {claim}")
+                print(
+                    f"Is this claim supported by the context according to bespoke-minicheck? {check(sourcetext, claim)}\n"
+                )
--- a/examples/python-grounded-factuality-rag-check/requirements.txt
+++ b/examples/python-grounded-factuality-rag-check/requirements.txt
@ -0,0 +1,8 @@
+ollama
+lxml==5.3.0
+lxml_html_clean==0.2.2
+mattsollamatools==0.0.25
+newspaper3k==0.2.8
+nltk==3.9.1
+numpy==1.26.4
+scikit-learn==1.5.2
--- a/examples/python-grounded-factuality-simple-check/main.py
+++ b/examples/python-grounded-factuality-simple-check/main.py
@ -0,0 +1,53 @@
+"""Simple example to demonstrate how to use the bespoke-minicheck model."""
+
+import ollama
+
+# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
+
+
+def check(document, claim):
+    """Ask bespoke-minicheck whether `claim` is supported by `document`.
+
+    The stripped model reply is returned: Yes/yes if the claim is supported
+    by the document, No/no otherwise. Support for logits will be added in
+    the future.
+
+    bespoke-minicheck's system prompt is defined as:
+      'Determine whether the provided claim is consistent with the corresponding
+      document. Consistency in this context implies that all information presented in the claim
+      is substantiated by the document. If not, it should be considered inconsistent. Please
+      assess the claim's consistency with the document by responding with either "Yes" or "No".'
+
+    bespoke-minicheck's user prompt is defined as:
+      "Document: {document}\nClaim: {claim}"
+    """
+    generation_options = {"num_predict": 2, "temperature": 0.0}
+    reply = ollama.generate(
+        model="bespoke-minicheck",
+        prompt=f"Document: {document}\nClaim: {claim}",
+        options=generation_options,
+    )
+    verdict = reply["response"]
+    return verdict.strip()
+
+
+def get_user_input(prompt):
+    """Prompt the user and return the line they entered.
+
+    An empty line, or end-of-input, ends the program. Otherwise a blank
+    separator line is printed and the entered text is returned.
+
+    Raises:
+        SystemExit: when nothing was entered.
+    """
+    try:
+        user_input = input(prompt)
+    except EOFError:
+        # Closed stdin (Ctrl-D / end of piped input) is treated like an empty line.
+        user_input = ""
+    if not user_input:
+        # `exit` is a helper added by the `site` module for interactive use and
+        # is not guaranteed to exist (e.g. under `python -S`), so raise
+        # SystemExit directly.
+        raise SystemExit
+    print()
+    return user_input
+
+
+def main():
+    """Repeatedly read a document and a claim, then print bespoke-minicheck's verdict.
+
+    The loop has no exit of its own; it ends when get_user_input() exits the
+    program on an empty entry.
+    """
+    while True:
+        # Example input: document "Ryan likes running and biking.",
+        # claim "Ryan likes to run."
+        document = get_user_input("Enter a document: ")
+        claim = get_user_input("Enter a claim: ")
+
+        # Ask the model whether the document supports the claim.
+        verdict = check(document, claim)
+        print(
+            f"Is the claim supported by the document according to bespoke-minicheck? {verdict}"
+        )
+        print("\n\n")
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/python-grounded-factuality-simple-check/readme.md
+++ b/examples/python-grounded-factuality-simple-check/readme.md
@ -0,0 +1,54 @@
+# Simple Bespoke-Minicheck Example
+
+`bespoke-minicheck` is a model for checking if a claim is supported by a document. It is used through the **generate** endpoint, which is called in this example with a `prompt` that includes the expected formatting of the user input. 
+
+## Running the Example
+
+1. Ensure you have the `bespoke-minicheck` model installed:
+
+   ```bash
+   ollama pull bespoke-minicheck
+   ```
+
+2. Install the dependencies:
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+3. Run the program:
+
+   ```bash
+   python main.py
+   ```
+
+4. Enter a document and a claim when prompted:
+
+   ```bash
+   Enter a document: Roses are red.
+
+   Enter a claim: Roses are blue. 
+   ```
+
+   The claim and document are then given to the `bespoke-minicheck` model as inputs, which then generates a response (Yes or No) on whether the claim is supported by the document.
+
+   ```bash
+   Is the claim supported by the document according to bespoke-minicheck? No
+   ```
+
+## More Examples
+
+Document ([source](https://en.wikipedia.org/wiki/Apple_I)): 
+> The Apple Computer 1 (Apple-1[a]), later known predominantly as the Apple I (written with a Roman numeral),[b] is an 8-bit motherboard-only personal computer designed by Steve Wozniak[5][6] and released by the Apple Computer Company (now Apple Inc.) in 1976. The company was initially formed to sell the Apple I – its first product – and would later become the world's largest technology company.[7] The idea of starting a company and selling the computer came from Wozniak's friend and Apple co-founder Steve Jobs.[8][9] One of the main innovations of the Apple I was that it included video display terminal circuitry on its circuit board, allowing it to connect to a low-cost composite video monitor or television, instead of an expensive computer terminal, compared to most existing computers at the time.
+
+Claim: 
+>The Apple I is a 16-bit computer.
+
+Expected output:
+>Is the claim supported by the document according to bespoke-minicheck? **No**
+
+Claim: 
+>Apple was originally called the Apple Computer Company.
+
+Expected output:
+>Is the claim supported by the document according to bespoke-minicheck? **Yes**
--- a/examples/python-grounded-factuality-simple-check/requirements.txt
+++ b/examples/python-grounded-factuality-simple-check/requirements.txt
@ -0,0 +1 @@
+ollama
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@ -42,7 +42,7 @@ const (

 var (
 	// Used to validate if the given ROCm lib is usable
-	ROCmLibGlobs          = []string{"libhipblas.so.2*", "rocblas"} // TODO - probably include more coverage of files here...
+	ROCmLibGlobs          = []string{"libhipblas.so.1*", "rocblas"} // TODO - probably include more coverage of files here...
 	RocmStandardLocations = []string{"/opt/rocm/lib", "/usr/lib64"}
 )

@ -391,7 +391,7 @@ func AMDValidateLibDir() (string, error) {
 	}

 	// Well known ollama installer path
-	installedRocmDir := "/usr/share/ollama/lib/rocm"
+	installedRocmDir := "/opt/rocm-5.7.1"
 	if rocmLibUsable(installedRocmDir) {
 		return installedRocmDir, nil
 	}
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@ -59,7 +59,7 @@ var (
 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}

-var RocmComputeMin = 9
+var RocmComputeMin = 8

 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@ -22,6 +22,7 @@ amdGPUs() {
        return
    fi
    GPU_LIST=(
+	"gfx803"
        "gfx900"
        "gfx906:xnack-"
        "gfx908:xnack-"
--- a/scripts/build.sh
+++ b/scripts/build.sh
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@ -23,7 +23,7 @@ docker buildx build \
        .

 # buildx behavior changes for single vs. multiplatform
-if echo $PLATFORM | grep "," > /dev/null ; then 
+if echo $PLATFORM | grep "," > /dev/null ; then
        mv -f ./dist/linux_*64/ollama* ./dist/
        rmdir ./dist/linux_*64
 fi
--- a/scripts/env.sh
+++ b/scripts/env.sh
@ -3,7 +3,7 @@
 export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
 # TODO - consider `docker buildx ls --format=json` to autodiscover platform capability
-PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"}
+PLATFORM=${PLATFORM:-"linux/amd64"}
 DOCKER_ORG=${DOCKER_ORG:-"ollama"}
 RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
 FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
@ -11,4 +11,4 @@ OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION --build-arg=GOFLAGS --build-arg=OL

 echo "Building Ollama"
 echo "VERSION=$VERSION"
-echo "PLATFORM=$PLATFORM"
+echo "PLATFORM=$PLATFORM"
Author	SHA1	Message	Date
mnccouk	900b4afcd1	Update README.md	2024-09-21 14:26:19 +01:00
Matt	9a8830ba9b	Updated version tag version number, use default 0.0.0	2024-09-21 14:21:08 +01:00
Matt	1ac12871e3	Changed to amd64 only build and also added docker-compose.yml	2024-09-21 13:47:19 +01:00
Matt	2d680d5527	Merge branch 'rx580_gpu' of https://github.com/mnccouk/ollama into rx580_gpu	2024-09-20 20:14:03 +01:00
mnc	22a28b7f0a	Merge remote-tracking branch 'origin/main' into rx580_gpu	2024-09-20 20:13:05 +01:00
Matt	061e8e49f6	Merge branch 'rx580_gpu' of https://github.com/mnccouk/ollama into rx580_gpu	2024-09-20 20:04:23 +01:00
mnc	43d22dc9f1	Merge branch 'main' into rx580_gpu # Conflicts: # Dockerfile # scripts/build_docker.sh # scripts/build_linux.sh	2024-09-20 20:00:14 +01:00
Matt	b0c75fd057	Merge branch 'rx580_gpu' of https://github.com/mnccouk/ollama into rx580_gpu	2024-09-19 21:11:45 +01:00
Matt	a59776dcb9	Added to README	2024-09-19 21:09:39 +01:00
Patrick Devine	5804cf1723	documentation for stopping a model (#6766 )	2024-09-18 16:26:42 -07:00
Ryan Marten	bf7ee0f4d4	examples: add python examples for `bespoke-minicheck` (#6841 )	2024-09-18 09:35:25 -07:00
mnccouk	c4e4ea6019	Update README.md	2024-09-15 16:26:23 +01:00
mnccouk	8fbc5f571a	Update README.md	2024-09-15 16:07:28 +01:00
Matt	7965511b9e	Added to README	2024-09-15 16:03:58 +01:00
Matt	3449201ce4	Changed to bild for rx580 GPU, this uses 5.7.1 rocm libraries	2024-09-15 14:59:52 +01:00