Compare commits

...

10 commits

Author SHA1 Message Date
Gered 47c356a6cd disable avx while still allowing gpu support
Some checks failed
release / build-darwin (push) Has been cancelled
release / generate-windows-cpu (push) Has been cancelled
release / generate-windows-rocm (push) Has been cancelled
release / generate-windows-cuda (CUDA 11: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe) (push) Has been cancelled
release / generate-windows-cuda (CUDA 12: https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe) (push) Has been cancelled
release / build-windows (push) Has been cancelled
release / build-linux-amd64 (push) Has been cancelled
release / build-linux-arm64 (push) Has been cancelled
release / build-container-image (linux) (push) Has been cancelled
release / build-container-image (linux-arm64) (push) Has been cancelled
release / merge (push) Has been cancelled
release / build-container-image-rocm (push) Has been cancelled
release / release (push) Has been cancelled
as per the discussion on this issue, and the most recent comment there on how
to fix it, at least temporarily, here:

https://github.com/ollama/ollama/issues/2187#issuecomment-2262876198
2024-09-22 13:27:17 -04:00
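For context on when this workaround matters: the diffs below force GetCPUCapability() to report AVX and switch -DGGML_AVX to off in the generate scripts, presumably so that the GPU-enabled runners stay selectable on CPUs that lack AVX. A quick way to check whether a Linux host actually advertises AVX (a generic check, not part of this change set):

    # list any AVX-family flags the CPU reports; no output means no AVX support
    grep -o 'avx[^ ]*' /proc/cpuinfo | sort -u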
Michael Yang 504a410f02
llm: add solar pro (preview) (#6846) 2024-09-17 18:11:26 -07:00
Jeffrey Morgan d05da29912
server: add tool parsing support for nemotron-mini (#6849) 2024-09-17 18:06:16 -07:00
Michael Yang 72962c6e08
Merge pull request #6833 from ollama/mxyng/git-am
make patches git am-able
2024-09-17 16:33:23 -07:00
Michael Yang 7bd7b02712 make patches git am-able
raw diffs can be applied using `git apply` but not with `git am`. git
patches, e.g. those generated with `git format-patch`, are both apply-able and am-able
2024-09-17 15:26:40 -07:00
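To illustrate the distinction (a generic git sketch, not commands from this repo's scripts): output from `git format-patch` carries the author, date, and commit message, so `git am` can replay it as a commit, while a raw diff only carries content changes.

    # export the most recent commit as an am-able, mbox-style patch
    git format-patch -1 HEAD -o ../patches/
    # option 1: apply content only (works for raw diffs too, records no commit)
    git apply ../patches/0001-*.patch
    # option 2: replay it as a commit, preserving author, date, and message
    git am ../patches/0001-*.patch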
Daniel Hiltgen 8f9ab5e14d
CI: dist directories no longer present (#6834)
The new buildx-based build no longer leaves the dist/linux-* directories
around, so we don't have to clean them up before uploading.
2024-09-16 17:31:37 -07:00
Daniel Hiltgen 7717bb6a84
CI: clean up naming, fix tagging latest (#6832)
The rocm CI step for RCs was incorrectly tagging them as the latest rocm build.
The multiarch manifest was incorrectly tagged twice (with and without the
prefix "v").  Static windows artifacts weren't being carried between build
jobs.  This also fixes the latest tagging script.
2024-09-16 16:18:41 -07:00
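The tagging fix (see the scripts/tag_latest.sh diff further down) drops the local docker manifest create/push sequence in favor of buildx imagetools, which retags an already-pushed multiarch manifest server-side:

    # repoint the floating tags at the versioned images; variables come from the release workflow
    docker buildx imagetools create -t ${FINAL_IMAGE_REPO}:latest ${FINAL_IMAGE_REPO}:${VERSION}
    docker buildx imagetools create -t ${FINAL_IMAGE_REPO}:rocm ${FINAL_IMAGE_REPO}:${VERSION}-rocm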
Daniel Hiltgen 0ec2915ea7
CI: set platform in build_linux script to keep buildx happy (#6829)
The runners don't have emulation set up, so the default multi-platform build
won't work.
2024-09-16 14:07:29 -07:00
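In practice each runner now builds only its native platform. A rough sketch of the difference (assuming a typical buildx invocation; the exact build_linux script is not part of this diff):

    # default multi-platform build: needs QEMU emulation or a native builder per target
    docker buildx build --platform linux/amd64,linux/arm64 .
    # what the workflow does instead: one platform per runner, e.g. PLATFORM=linux/amd64
    docker buildx build --platform "$PLATFORM" .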
Michael Yang c9a7541b9c
readme: add Agents-Flex to community integrations (#6788) 2024-09-16 13:42:52 -07:00
Patrick Devine d81cfd7d6f
fix typo in import docs (#6828) 2024-09-16 11:48:14 -07:00
21 changed files with 694 additions and 133 deletions

View file

@ -104,6 +104,7 @@ jobs:
path: |
build/**/*
build/**/*.a
llm/build/**/*.a
dist/windows-amd64/**
# ROCm generation step
@ -359,7 +360,7 @@ jobs:
environment: release
runs-on: linux
env:
BUILD_ARCH: amd64
PLATFORM: linux/amd64
steps:
- uses: actions/checkout@v4
with:
@ -382,7 +383,7 @@ jobs:
environment: release
runs-on: linux-arm64
env:
BUILD_ARCH: arm64
PLATFORM: linux/arm64
steps:
- uses: actions/checkout@v4
with:
@ -421,7 +422,7 @@ jobs:
!dist/*-cov
# Container image build
build-linux:
build-container-image:
environment: release
strategy:
matrix:
@ -459,7 +460,6 @@ jobs:
flavor: |
latest=false
tags: |
type=ref,event=tag
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
@ -503,7 +503,7 @@ jobs:
environment: release
runs-on: linux
needs:
- build-linux
- build-container-image
env:
FINAL_IMAGE_REPO: ollama/ollama
steps:
@ -526,7 +526,6 @@ jobs:
flavor: |
latest=false
tags: |
type=ref,event=tag
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
@ -551,7 +550,7 @@ jobs:
- name: Inspect image
run: |
docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }}
build-linux-rocm:
build-container-image-rocm:
environment: release
runs-on: linux
env:
@ -570,7 +569,6 @@ jobs:
flavor: |
latest=false
tags: |
type=ref,event=tag
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
@ -592,7 +590,7 @@ jobs:
target: runtime-rocm
build-args: |
GOFLAGS
tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm,${{ env.FINAL_IMAGE_REPO }}:rocm
tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm
push: true
# Aggregate all the assets and ship a release
@ -625,8 +623,6 @@ jobs:
ls -lh dist/
(cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
mv sha256sum.txt dist/
mv dist/linux-???64 .
mv dist/linux-amd64-rocm .
cat dist/sha256sum.txt
- name: Create or update Release
run: |

View file

@ -393,6 +393,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient)
- [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
- [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
- [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
### Mobile

View file

@ -38,7 +38,7 @@ Ollama supports importing adapters based on several different model architecture
You can create the adapter using a fine tuning framework or tool which can output adapters in the Safetensors format, such as:
* Hugging Face [fine tuning framework] (https://huggingface.co/docs/transformers/en/training)
* Hugging Face [fine tuning framework](https://huggingface.co/docs/transformers/en/training)
* [Unsloth](https://github.com/unslothai/unsloth)
* [MLX](https://github.com/ml-explore/mlx)

View file

@ -17,7 +17,8 @@ func GetCPUCapability() CPUCapability {
return CPUCapabilityAVX
}
// else LCD
return CPUCapabilityNone
//return CPUCapabilityNone
return CPUCapabilityAVX
}
func IsNUMA() bool {

View file

@ -69,22 +69,10 @@ git_module_setup() {
}
apply_patches() {
# Wire up our CMakefile
if ! grep ollama ${LLAMACPP_DIR}/CMakeLists.txt; then
echo 'add_subdirectory(../ext_server ext_server) # ollama' >>${LLAMACPP_DIR}/CMakeLists.txt
fi
if [ -n "$(ls -A ../patches/*.diff)" ]; then
# apply temporary patches until fix is upstream
for patch in ../patches/*.diff; do
for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
(cd ${LLAMACPP_DIR}; git checkout ${file})
done
done
for patch in ../patches/*.diff; do
(cd ${LLAMACPP_DIR} && git apply ${patch})
done
fi
# apply temporary patches until fix is upstream
for patch in ../patches/*.patch; do
git -c 'user.name=nobody' -c 'user.email=<>' -C ${LLAMACPP_DIR} am ${patch}
done
}
build() {

View file

@ -52,7 +52,8 @@ if [ -z "${CUDACXX}" ]; then
export CUDACXX=$(command -v nvcc)
fi
fi
COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
# COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup

View file

@ -83,29 +83,9 @@ function git_module_setup {
}
function apply_patches {
# Wire up our CMakefile
if (!(Select-String -Path "${script:llamacppDir}/CMakeLists.txt" -Pattern 'ollama')) {
Add-Content -Path "${script:llamacppDir}/CMakeLists.txt" -Value 'add_subdirectory(../ext_server ext_server) # ollama'
}
# Apply temporary patches until fix is upstream
$patches = Get-ChildItem "../patches/*.diff"
foreach ($patch in $patches) {
# Extract file paths from the patch file
$filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
$parts = $_ -split ' '
($parts[1] -split '/', 2)[1]
}
# Checkout each file
foreach ($file in $filePaths) {
git -C "${script:llamacppDir}" checkout $file
}
}
# Apply each patch
foreach ($patch in $patches) {
git -C "${script:llamacppDir}" apply $patch.FullName
foreach ($patch in $(Get-ChildItem "../patches/*.patch")) {
git -c 'user.name=nobody' -c 'user.email=<>' -C "${script:llamacppDir}" am $patch.FullName
}
}

View file

@ -0,0 +1,22 @@
From 8b8d83ffca775840acc5dc700f3b3703e9f5cfe4 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Fri, 23 Aug 2024 11:27:48 -0700
Subject: [PATCH] patch cmakelist
---
CMakeLists.txt | 2 ++
1 file changed, 2 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a3132063..6a2a9912 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -199,3 +199,5 @@ if (LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()
+
+add_subdirectory(../ext_server ext_server) # ollama
--
2.45.2

View file

@ -1,8 +1,18 @@
From 2cfaa0a04faa9c87ba8f1ac8527eb953e69c6cde Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:10 -0700
Subject: [PATCH] 01-load-progress.diff
---
common/common.cpp | 2 ++
common/common.h | 7 +++++++
2 files changed, 9 insertions(+)
diff --git a/common/common.cpp b/common/common.cpp
index 2c05a4d4..927f0e3d 100644
index 9fa18472..48ff41e9 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2093,6 +2093,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
@@ -2573,6 +2573,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
mparams.use_mmap = params.use_mmap;
mparams.use_mlock = params.use_mlock;
mparams.check_tensors = params.check_tensors;
@ -12,10 +22,10 @@ index 2c05a4d4..927f0e3d 100644
mparams.kv_overrides = NULL;
} else {
diff --git a/common/common.h b/common/common.h
index 65c0ef81..ebca2c77 100644
index cb5e7f6d..d8f043f7 100644
--- a/common/common.h
+++ b/common/common.h
@@ -184,6 +184,13 @@ struct gpt_params {
@@ -204,6 +204,13 @@ struct gpt_params {
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
@ -29,3 +39,6 @@ index 65c0ef81..ebca2c77 100644
// embedding
bool embedding = false; // get only sentence embedding
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
--
2.46.0

View file

@ -1,5 +1,14 @@
From ba4bba80a744f76ac67b8234451c259a3c5da83b Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:11 -0700
Subject: [PATCH] 02-clip-log.diff
---
examples/llava/clip.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e431c7f7..f077e688 100644
index 9b890571..cb51793d 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
@ -10,3 +19,6 @@ index e431c7f7..f077e688 100644
#include "log.h"
#include "ggml.h"
#include "ggml-alloc.h"
--
2.46.0

View file

@ -1,8 +1,17 @@
From e43bfd3f607a6dfcaba2d490d35f412a52e55e30 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:12 -0700
Subject: [PATCH] 03-load_exception.diff
---
src/llama.cpp | 25 ++++++++++++++++---------
1 file changed, 16 insertions(+), 9 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 73f52435..58a00fb1 100644
index 88355971..926bb71a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7241,7 +7241,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
@@ -8635,7 +8635,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
}
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
@ -11,7 +20,7 @@ index 73f52435..58a00fb1 100644
}
return 0;
@@ -17564,16 +17564,23 @@ struct llama_model * llama_load_model_from_file(
@@ -18022,16 +18022,23 @@ struct llama_model * llama_load_model_from_file(
}
model->rpc_servers.push_back(servers);
}
@ -43,3 +52,6 @@ index 73f52435..58a00fb1 100644
}
return model;
--
2.46.0

View file

@ -1,8 +1,17 @@
From 29411d9a9d2b6a0af6425ffe88498f17f71f7d5d Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:12 -0700
Subject: [PATCH] 04-metal.diff
---
ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
1 file changed, 13 insertions(+), 17 deletions(-)
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 0207b787..b5e9884b 100644
index 91b5e61b..9cfa72ac 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
@@ -1734,27 +1734,23 @@ static enum ggml_status ggml_metal_graph_compute(
// to the matrix-vector kernel
int ne11_mm_min = 1;
@ -43,3 +52,6 @@ index 0207b787..b5e9884b 100644
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
--
2.46.0

View file

@ -1,5 +1,14 @@
From b298ac8614d1e38da28f760eb1d2ae8af0fbbe62 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:13 -0700
Subject: [PATCH] 05-default-pretokenizer.diff
---
src/llama.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 88355971..dd7d41ed 100644
index 926bb71a..d1e959fc 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6083,16 +6083,7 @@ static void llm_load_vocab(
@ -30,3 +39,6 @@ index 88355971..dd7d41ed 100644
}
} else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--
2.46.0

View file

@ -1,8 +1,17 @@
From c9a6ca9fc039233dee746a4da9705762cd9e515d Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:14 -0700
Subject: [PATCH] 06-embeddings.diff
---
src/llama.cpp | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 88355971..d7db689b 100644
index d1e959fc..f79bd782 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15906,7 +15906,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
@@ -15898,7 +15898,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;
// TODO: use a per-batch flag for logits presence instead
@ -11,7 +20,7 @@ index 88355971..d7db689b 100644
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -16175,20 +16175,23 @@ static int llama_decode_internal(
@@ -16167,20 +16167,23 @@ static int llama_decode_internal(
// no output
res = nullptr;
embd = nullptr;
@ -41,3 +50,6 @@ index 88355971..d7db689b 100644
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
ggml_backend_sched_alloc_graph(lctx.sched, gf);
--
2.46.0

View file

@ -1,8 +1,17 @@
From ae2b188a679c83ce105aa1e823499441dfab3c57 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:15 -0700
Subject: [PATCH] 07-clip-unicode.diff
---
examples/llava/clip.cpp | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 95fbe3d0..5a02a6ec 100644
index cb51793d..8716472b 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -32,6 +33,14 @@
@@ -41,6 +41,14 @@
#include <cinttypes>
#include <limits>
@ -17,7 +26,7 @@ index 95fbe3d0..5a02a6ec 100644
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
@@ -1055,7 +1064,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1223,7 +1231,22 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
return nullptr;
}
@ -40,3 +49,6 @@ index 95fbe3d0..5a02a6ec 100644
if (!fin) {
LOG_TEE("cannot open model file for loading tensors\n");
clip_free(new_clip);
--
2.46.0

View file

@ -0,0 +1,402 @@
From 8313ce5f43f11f3d84f352f97f3802792e90e18c Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:16 -0700
Subject: [PATCH] add solar-pro support
solar-pro introduces block skip connections where blocks are connected
to other, non-sequential blocks with a scale multiple
this change adds 4 new keys to store the skip connections and one new
tensor to store the scalar. the scalar is implemented a 1-dimensional
tensor with 2 elements dervied from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
src/llama.cpp | 267 +++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 254 insertions(+), 13 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index f79bd782..b7771f53 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -213,6 +213,7 @@ enum llm_arch {
LLM_ARCH_NEMOTRON,
LLM_ARCH_EXAONE,
LLM_ARCH_RWKV6,
+ LLM_ARCH_SOLAR,
LLM_ARCH_UNKNOWN,
};
@@ -261,6 +262,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_NEMOTRON, "nemotron" },
{ LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_RWKV6, "rwkv6" },
+ { LLM_ARCH_SOLAR, "solar" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@@ -314,6 +316,7 @@ enum llm_kv {
LLM_KV_ATTENTION_KV_LORA_RANK,
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -405,19 +408,20 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
{ LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
- { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
- { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
- { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
- { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
- { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
- { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
- { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
- { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
- { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
- { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
- { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
- { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
- { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
+ { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
+ { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
+ { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
+ { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
+ { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
+ { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
+ { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+ { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
+ { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
+ { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
+ { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
+ { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -589,6 +593,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_FFN_DOWN,
LLM_TENSOR_ENC_FFN_UP,
LLM_TENSOR_ENC_OUTPUT_NORM,
+ LLM_TENSOR_BSKCN_TV,
};
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
@@ -1408,6 +1413,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
},
},
+ {
+ LLM_ARCH_SOLAR,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_BSKCN_TV, "bskcn_tv" },
+ },
+ },
{
LLM_ARCH_UNKNOWN,
{
@@ -2237,6 +2260,7 @@ enum e_model {
MODEL_15B,
MODEL_16B,
MODEL_20B,
+ MODEL_22B,
MODEL_30B,
MODEL_34B,
MODEL_35B,
@@ -2284,6 +2308,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
+ std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
+
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -2349,6 +2375,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2455,6 +2482,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
+
+ bool n_bskcn(uint32_t n, uint32_t il = 0) const {
+ if (il < n_layer) {
+ return n_bskcn_arr[n][il] > 0;
+ }
+
+ GGML_ABORT("fatal error");
+ }
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2635,6 +2670,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
+
+ struct ggml_tensor * bskcn_tv;
};
// very similar to llama_batch,
@@ -5937,6 +5974,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ for (int i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
+ auto & bskcn = hparams.n_bskcn_arr.at(i);
+ bskcn.fill(0);
+ ml.get_key_or_arr(::format(LLM_KV_NAMES.at(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION), LLM_ARCH_NAMES.at(ml.llm_kv.arch), i), bskcn, hparams.n_layer, false);
+ }
+
+ switch (hparams.n_layer) {
+ case 64: model.type = e_model::MODEL_22B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ }
default: (void)0;
}
@@ -8420,6 +8472,38 @@ static bool llm_load_tensors(
}
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ }
+ } break;
default:
throw std::runtime_error("unknown architecture");
}
@@ -15173,6 +15257,158 @@ struct llm_build_context {
return gf;
}
+
+ ggml_cgraph * build_solar() {
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct ggml_tensor * cur;
+ struct ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ struct ggml_tensor * bskcn_1;
+ struct ggml_tensor * bskcn_2;
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * inpSA = inpL;
+
+ if (hparams.n_bskcn(0, il)) {
+ bskcn_1 = inpSA;
+ }
+
+ if (hparams.n_bskcn(1, il)) {
+ bskcn_2 = inpSA;
+ }
+
+ if (hparams.n_bskcn(2, il)) {
+ inpSA = ggml_add(
+ ctx0,
+ ggml_mul(ctx0, bskcn_1, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+
+ if (hparams.n_bskcn(3, il)) {
+ inpSA = ggml_add(
+ ctx0,
+ ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
+ ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
+ }
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+ // compute Q and K and RoPE them
+ struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+ model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+ cb(cur, "ffn_out", il);
+
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
};
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -15423,6 +15659,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_rwkv6();
} break;
+ case LLM_ARCH_SOLAR:
+ {
+ result = llm.build_solar();
+ } break;
default:
GGML_ABORT("fatal error");
}
@@ -18503,6 +18743,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_ARCTIC:
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_CHATGLM:
+ case LLM_ARCH_SOLAR:
return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2
--
2.46.0
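
Restating the block skip connection from the solar-pro patch above as a plain formula (an informal paraphrase of the ggml_mul/ggml_add sequence in build_solar, not text from the patch): for a layer flagged by skip-connection key 2 (or 3), the hidden state saved at the layer flagged by key 0 (or 1) is blended into the current residual as

    inpSA = t * saved + (1 - t) * inpSA

where saved is bskcn_1 (or bskcn_2) and the two elements of the layer's bskcn_tv tensor hold (t, 1 - t).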

View file

@ -2,32 +2,12 @@
set -eu
# We use 2 different image repositories to handle combining architecture images into multiarch manifest
# (The ROCm image is x86 only and is not a multiarch manifest)
# For developers, you can override the DOCKER_ORG to generate multiarch manifests
# DOCKER_ORG=jdoe VERSION=0.1.30 PUSH=1 ./scripts/tag_latest.sh
# DOCKER_ORG=jdoe VERSION=0.1.30 ./scripts/tag_latest.sh
DOCKER_ORG=${DOCKER_ORG:-"ollama"}
RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
# Set PUSH to a non-empty string to trigger push instead of load
PUSH=${PUSH:-""}
echo "Assembling manifest and tagging latest"
docker manifest rm ${FINAL_IMAGE_REPO}:latest || true
docker manifest create ${FINAL_IMAGE_REPO}:latest \
${RELEASE_IMAGE_REPO}:$VERSION-amd64 \
${RELEASE_IMAGE_REPO}:$VERSION-arm64
docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm
docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:rocm
if [ -n "${PUSH}" ]; then
echo "Pushing latest tags up..."
docker manifest push ${FINAL_IMAGE_REPO}:latest
docker push ${FINAL_IMAGE_REPO}:rocm
else
echo "Not pushing ${FINAL_IMAGE_REPO}:latest and ${FINAL_IMAGE_REPO}:rocm"
fi
echo "Updating ${FINAL_IMAGE_REPO}:latest -> ${FINAL_IMAGE_REPO}:${VERSION}"
docker buildx imagetools create -t ${FINAL_IMAGE_REPO}:latest ${FINAL_IMAGE_REPO}:${VERSION}
echo "Updating ${FINAL_IMAGE_REPO}:rocm -> ${FINAL_IMAGE_REPO}:${VERSION}-rocm"
docker buildx imagetools create -t ${FINAL_IMAGE_REPO}:rocm ${FINAL_IMAGE_REPO}:${VERSION}-rocm

View file

@ -272,6 +272,30 @@ func detectContentType(r io.Reader) (string, error) {
return "unknown", nil
}
func parseObjects(s string) []map[string]any {
var objs []map[string]any
for offset := 0; offset < len(s); {
var obj map[string]any
decoder := json.NewDecoder(strings.NewReader(s[offset:]))
if err := decoder.Decode(&obj); errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
break
} else if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
// skip over any syntax errors
offset += int(syntax.Offset)
} else if unmarshalType := &(json.UnmarshalTypeError{}); errors.As(err, &unmarshalType) {
// skip over any unmarshalable types
offset += int(unmarshalType.Offset)
} else if err != nil {
return nil
} else {
offset += int(decoder.InputOffset())
objs = append(objs, obj)
}
}
return objs
}
// parseToolCalls attempts to parse a JSON string into a slice of ToolCalls.
// mxyng: this only really works if the input contains tool calls in some JSON format
func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
@ -304,16 +328,14 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
return nil, false
}
var kv map[string]any
// execute the subtree with placeholders to identify the keys
// trim any commands that might exist in the template
if err := json.Unmarshal(bytes.TrimSuffix(b.Bytes(), []byte(",")), &kv); err != nil {
templateObjects := parseObjects(b.String())
if len(templateObjects) == 0 {
return nil, false
}
// find the keys that correspond to the name and arguments fields
var name, arguments string
for k, v := range kv {
for k, v := range templateObjects[0] {
switch v.(type) {
case string:
name = k
@ -326,43 +348,32 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
return nil, false
}
var objs []map[string]any
for offset := 0; offset < len(s); {
var obj map[string]any
decoder := json.NewDecoder(strings.NewReader(s[offset:]))
if err := decoder.Decode(&obj); errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
break
} else if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
// skip over any syntax errors
offset += int(syntax.Offset)
} else if unmarshalType := &(json.UnmarshalTypeError{}); errors.As(err, &unmarshalType) {
// skip over any unmarshalable types
offset += int(unmarshalType.Offset)
} else if err != nil {
slog.Error("parseToolCalls", "error", err)
return nil, false
} else {
offset += int(decoder.InputOffset())
responseObjects := parseObjects(s)
if len(responseObjects) == 0 {
return nil, false
}
// collect all nested objects
var collect func(any) []map[string]any
collect = func(obj any) (all []map[string]any) {
switch o := obj.(type) {
case map[string]any:
all = append(all, o)
for _, v := range o {
all = append(all, collect(v)...)
}
case []any:
for _, v := range o {
all = append(all, collect(v)...)
}
}
return all
// collect all nested objects
var collect func(any) []map[string]any
collect = func(obj any) (all []map[string]any) {
switch o := obj.(type) {
case map[string]any:
all = append(all, o)
for _, v := range o {
all = append(all, collect(v)...)
}
case []any:
for _, v := range o {
all = append(all, collect(v)...)
}
objs = append(objs, collect(obj)...)
}
return all
}
var objs []map[string]any
for _, p := range responseObjects {
objs = append(objs, collect(p)...)
}
var toolCalls []api.ToolCall

View file

@ -69,6 +69,7 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`,
{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}
</tool_call>`, true},
{"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true},
{"nemotron", `<toolcall>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]} </toolcall>`, true},
}
var tools []api.Tool
@ -217,3 +218,45 @@ func TestParseLayerFromCopy(t *testing.T) {
t.Fatalf("got %d != want 5", len(layers))
}
}
func TestParseObjects(t *testing.T) {
tests := []struct {
input string
want []map[string]any
}{
{
input: `[{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]`,
want: []map[string]any{
{"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}},
{"name": "get_current_weather", "arguments": map[string]any{"format": "celsius", "location": "Toronto, Canada"}},
},
},
{
input: `<toolcall>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </toolcall>`,
want: []map[string]any{
{"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}},
},
},
{
input: `<toolcall>{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} </toolcall> <toolcall>{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, ON"}} </toolcall>`,
want: []map[string]any{
{"name": "get_current_weather", "arguments": map[string]any{"format": "fahrenheit", "location": "San Francisco, CA"}},
{"name": "get_current_weather", "arguments": map[string]any{"format": "celsius", "location": "Toronto, ON"}},
},
},
{
input: `{"name": "get_current_weather", "arguments": `,
want: nil,
},
}
for _, tc := range tests {
t.Run(tc.input, func(t *testing.T) {
got := parseObjects(tc.input)
if diff := cmp.Diff(got, tc.want); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}
}

33
server/testdata/tools/nemotron.gotmpl vendored Normal file
View file

@ -0,0 +1,33 @@
{{- if (or .Tools .System) }}<extra_id_0>System
{{ if .System }}{{ .System }}
{{ end }}
{{- if .Tools }}
{{- range .Tools }}<tool> {{ . }} </tool>{{ end }}
{{ end }}
{{- end }}
{{- range $i, $m := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1 -}}
{{- if eq .Role "user" }}<extra_id_1>User
{{ .Content }}
{{- if $last }}
<extra_id_1>Assistant
{{- end }}
{{ else if eq .Role "tool" }}<extra_id_1>Tool
{{ .Content }}
{{- if $last }}
<extra_id_1>Assistant
{{- end }}
{{ else if eq .Role "assistant" }}<extra_id_1>Assistant
{{- if .ToolCalls }}
{{ range .ToolCalls }}<toolcall> {"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}} </toolcall> {{ end }}
{{ else }}
{{ .Content }}
{{- if not $last }}
{{ end }}
{{- end }}
{{- end }}
{{- end }}

18
server/testdata/tools/nemotron.out vendored Normal file
View file

@ -0,0 +1,18 @@
<extra_id_0>System
You are a knowledgable assistant. You can answer questions and perform tasks.
<tool> {"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}} </tool>
<extra_id_1>User
What's the weather like today in Paris?
<extra_id_1>Assistant
<toolcall> {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}} </toolcall>
<extra_id_1>Tool
22
<extra_id_1>Assistant
The current temperature in Paris, France is 22 degrees Celsius.
<extra_id_1>User
What's the weather like today in San Francisco and Toronto?
<extra_id_1>Assistant