diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 555e62a9..30aa2d9e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -70,10 +70,10 @@ jobs: os: ubuntu-22.04 artifact: "linux-2" - name: "macOS x64" - os: macos-13 + os: macos-15-intel artifact: "mac-x64" - name: "macOS arm64" - os: macos-14 + os: macos-26 artifact: "mac-arm64" steps: @@ -194,10 +194,11 @@ jobs: sudo apt install vulkan-sdk - name: Install dependencies on macOS - if: matrix.config.name == 'macOS' + if: matrix.config.name == 'macOS x64' || matrix.config.name == 'macOS arm64' run: | brew install cmake ninja alias make=cmake + cmake --version - name: Setup & Build id: build @@ -461,7 +462,7 @@ jobs: model-dependent-tests: name: Model dependent tests - runs-on: macos-13 + runs-on: macos-26 env: NODE_LLAMA_CPP_GPU: false needs: @@ -490,7 +491,6 @@ jobs: # sudo apt-get install ninja-build cmake - name: Install dependencies on macOS - if: matrix.config.name == 'macOS' run: | brew install cmake ninja alias make=cmake @@ -602,10 +602,8 @@ jobs: - name: Release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - NPM_TOKEN: ${{ secrets.NPM_TOKEN }} GH_RELEASE_REF: ${{ github.ref }} run: | - echo "//registry.npmjs.org/:_authToken=\${NPM_TOKEN}" > ~/.npmrc export DRY_RUN_RESULT_FILE_PATH="$(pwd)/semanticReleaseDryRunReleaseResult.json" git apply --ignore-whitespace ./scripts/patches/@semantic-release+github+11.0.0.patch @@ -638,13 +636,10 @@ jobs: - name: Release `create-node-llama-cpp` module if: steps.set-npm-url.outputs.npm-url != '' env: - NPM_TOKEN: ${{ secrets.NPM_TOKEN }} GH_RELEASE_REF: ${{ github.ref }} run: | cd packages/create-node-llama-cpp - echo "//registry.npmjs.org/:_authToken=\${NPM_TOKEN}" > ~/.npmrc - if [ "$GH_RELEASE_REF" == "refs/heads/beta" ]; then npm publish --tag beta else @@ -682,7 +677,7 @@ jobs: - name: "Ubuntu" os: ubuntu-22.04 - name: "macOS" - os: macos-13 + os: macos-15-intel steps: - uses: actions/checkout@v4 diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index d1b1052b..36828590 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -2,6 +2,7 @@ #include #include #include "common/common.h" +#include "llama-vocab.h" #include "llama.h" #include "addonGlobals.h" @@ -345,8 +346,14 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker { } } - sampler->acceptToken(new_token_id); - result = new_token_id; + try { + sampler->acceptToken(new_token_id); + result = new_token_id; + } catch (const std::exception& e) { + SetError(std::string("Failed to accept token in sampler: ") + e.what()); + } catch(...) { + SetError("Unknown error when calling \"acceptToken\""); + } } void OnOK() { Napi::Number resultToken; diff --git a/llama/addon/AddonGrammar.cpp b/llama/addon/AddonGrammar.cpp index 3f061f54..92247437 100644 --- a/llama/addon/AddonGrammar.cpp +++ b/llama/addon/AddonGrammar.cpp @@ -49,7 +49,15 @@ Napi::Value AddonGrammar::isTextCompatible(const Napi::CallbackInfo& info) { llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(parsed_grammar); for (const auto & cpt : cpts) { - llama_grammar_accept(parsed_grammar, cpt); + try { + llama_grammar_accept(parsed_grammar, cpt); + } catch (const std::exception & e) { + llama_grammar_free_impl(parsed_grammar); + return Napi::Boolean::New(info.Env(), false); + } catch (...) 
{ + llama_grammar_free_impl(parsed_grammar); + return Napi::Boolean::New(info.Env(), false); + } if (stacks_cur.empty()) { // no stacks means that the grammar failed to match at this point diff --git a/llama/addon/AddonSampler.cpp b/llama/addon/AddonSampler.cpp index 12c9a1ab..fc58de6f 100644 --- a/llama/addon/AddonSampler.cpp +++ b/llama/addon/AddonSampler.cpp @@ -1,5 +1,7 @@ #include #include "common/common.h" +#include "globals/addonLog.h" +#include "ggml.h" #include "llama.h" #include "AddonGrammarEvaluationState.h" @@ -449,7 +451,15 @@ Napi::Value AddonSampler::AcceptGrammarEvaluationStateToken(const Napi::Callback llama_token tokenId = info[1].As().Int32Value(); if ((grammar_evaluation_state)->sampler != nullptr) { - llama_sampler_accept((grammar_evaluation_state)->sampler, tokenId); + try { + llama_sampler_accept((grammar_evaluation_state)->sampler, tokenId); + } catch (const std::exception & e) { + Napi::Error::New(info.Env(), std::string("Failed to accept token in grammar sampler: ") + e.what()).ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } catch (...) { + Napi::Error::New(info.Env(), "Failed to accept token in grammar sampler").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } } return info.Env().Undefined(); @@ -465,7 +475,14 @@ Napi::Value AddonSampler::CanBeNextTokenForGrammarEvaluationState(const Napi::Ca candidates.emplace_back(llama_token_data { tokenId, 1, 0.0f }); llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - llama_sampler_apply((grammar_evaluation_state)->sampler, &candidates_p); + try { + llama_sampler_apply((grammar_evaluation_state)->sampler, &candidates_p); + } catch (const std::exception & e) { + addonLog(GGML_LOG_LEVEL_DEBUG, std::string("Failed to apply grammar sampler: ") + e.what()); + return Napi::Boolean::New(info.Env(), false); + } catch (...) 
{ + return Napi::Boolean::New(info.Env(), false); + } if (candidates_p.size == 0 || candidates_p.data[0].logit == -INFINITY) { return Napi::Boolean::New(info.Env(), false); diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp index a01a987e..66e45a12 100644 --- a/llama/addon/addon.cpp +++ b/llama/addon/addon.cpp @@ -11,6 +11,8 @@ #include "globals/getSwapInfo.h" #include "globals/getMemoryInfo.h" +#include + bool backendInitialized = false; bool backendDisposed = false; @@ -226,6 +228,11 @@ Napi::Value addonSetNuma(const Napi::CallbackInfo& info) { return info.Env().Undefined(); } +Napi::Value markLoaded(const Napi::CallbackInfo& info) { + static std::atomic_bool loaded = false; + return Napi::Boolean::New(info.Env(), loaded.exchange(true)); +} + Napi::Value addonInit(const Napi::CallbackInfo& info) { if (backendInitialized) { Napi::Promise::Deferred deferred = Napi::Promise::Deferred::New(info.Env()); @@ -266,6 +273,7 @@ static void addonFreeLlamaBackend(Napi::Env env, int* data) { Napi::Object registerCallback(Napi::Env env, Napi::Object exports) { exports.DefineProperties({ + Napi::PropertyDescriptor::Function("markLoaded", markLoaded), Napi::PropertyDescriptor::Function("systemInfo", systemInfo), Napi::PropertyDescriptor::Function("getSupportsGpuOffloading", addonGetSupportsGpuOffloading), Napi::PropertyDescriptor::Function("getSupportsMmap", addonGetSupportsMmap), diff --git a/llama/addon/globals/addonLog.cpp b/llama/addon/globals/addonLog.cpp index ae52ae30..c80820a5 100644 --- a/llama/addon/globals/addonLog.cpp +++ b/llama/addon/globals/addonLog.cpp @@ -137,3 +137,7 @@ Napi::Value setLoggerLogLevel(const Napi::CallbackInfo& info) { return info.Env().Undefined(); } + +void addonLog(ggml_log_level level, const std::string text) { + addonLlamaCppLogCallback(level, std::string("[addon] " + text + "\n").c_str(), nullptr); +} diff --git a/llama/addon/globals/addonLog.h b/llama/addon/globals/addonLog.h index cc15681e..44c445cd 100644 --- a/llama/addon/globals/addonLog.h +++ b/llama/addon/globals/addonLog.h @@ -20,3 +20,5 @@ using AddonThreadSafeLogCallbackFunction = Napi::Value setLogger(const Napi::CallbackInfo& info); Napi::Value setLoggerLogLevel(const Napi::CallbackInfo& info); + +void addonLog(ggml_log_level level, const std::string text); diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index a1cbefc3..122c87cf 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -48,6 +48,7 @@ export type BindingModule = { acceptGrammarEvaluationStateToken(grammarEvaluationState: AddonGrammarEvaluationState, token: Token): void, canBeNextTokenForGrammarEvaluationState(grammarEvaluationState: AddonGrammarEvaluationState, token: Token): boolean }, + markLoaded(): boolean, systemInfo(): string, getSupportsGpuOffloading(): boolean, getSupportsMmap(): boolean, diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts index e65060a5..d5725e96 100644 --- a/src/bindings/Llama.ts +++ b/src/bindings/Llama.ts @@ -73,7 +73,7 @@ export class Llama { private constructor({ bindings, bindingPath, extBackendsPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, numa, buildGpu, - maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator + maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator, skipLlamaInit }: { bindings: BindingModule, bindingPath: string, @@ -94,7 +94,8 @@ export class Llama { vramPadding: MemoryReservation, ramOrchestrator: MemoryOrchestrator, ramPadding: 
MemoryReservation, - swapOrchestrator: MemoryOrchestrator + swapOrchestrator: MemoryOrchestrator, + skipLlamaInit: boolean }) { this._dispatchPendingLogMicrotask = this._dispatchPendingLogMicrotask.bind(this); this._onAddonLog = this._onAddonLog.bind(this); @@ -106,7 +107,9 @@ export class Llama { ? LlamaLogLevel.debug : (logLevel ?? LlamaLogLevel.debug); - if (!this._debug) { + const previouslyLoaded = bindings.markLoaded(); + + if (!this._debug && (!skipLlamaInit || !previouslyLoaded)) { this._bindings.setLogger(this._onAddonLog); this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(this._logLevel) ?? defaultLogLevel); } @@ -576,7 +579,8 @@ export class Llama { vramPadding: vramOrchestrator.reserveMemory(0), ramOrchestrator, ramPadding: resolvedRamPadding, - swapOrchestrator + swapOrchestrator, + skipLlamaInit }); if (llama.gpu === false || vramPadding === 0) { @@ -688,6 +692,8 @@ function getTransformedLogLevel(level: LlamaLogLevel, message: string, gpu: Buil return LlamaLogLevel.info; else if (gpu === false && level === LlamaLogLevel.warn && message.startsWith("llama_adapter_lora_init_impl: lora for '") && message.endsWith("' cannot use buft 'CPU_REPACK', fallback to CPU")) return LlamaLogLevel.info; + else if (gpu === "metal" && level === LlamaLogLevel.warn && message.startsWith("ggml_metal_device_init: tensor API disabled for")) + return LlamaLogLevel.info; return level; } diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts index fe56d3e4..51e5cdf9 100644 --- a/src/bindings/utils/compileLLamaCpp.ts +++ b/src/bindings/utils/compileLLamaCpp.ts @@ -131,12 +131,20 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions if (!cmakeCustomOptions.has("GGML_CCACHE")) cmakeCustomOptions.set("GGML_CCACHE", "OFF"); - if (!cmakeCustomOptions.has("LLAMA_CURL") || isCmakeValueOff(cmakeCustomOptions.get("LLAMA_CURL"))) { - cmakeCustomOptions.set("LLAMA_CURL", "OFF"); + // avoid linking to extra libraries that we don't use + { + if (!cmakeCustomOptions.has("LLAMA_CURL") || isCmakeValueOff(cmakeCustomOptions.get("LLAMA_CURL"))) + cmakeCustomOptions.set("LLAMA_CURL", "OFF"); - // avoid linking to extra libraries that we don't use - if (!cmakeCustomOptions.has("LLAMA_OPENSSL")) - cmakeCustomOptions.set("LLAMA_OPENSSL", "OFF"); + if (!cmakeCustomOptions.has("LLAMA_HTTPLIB") || isCmakeValueOff(cmakeCustomOptions.get("LLAMA_HTTPLIB"))) { + cmakeCustomOptions.set("LLAMA_HTTPLIB", "OFF"); + + if (!cmakeCustomOptions.has("LLAMA_BUILD_BORINGSSL")) + cmakeCustomOptions.set("LLAMA_BUILD_BORINGSSL", "OFF"); + + if (!cmakeCustomOptions.has("LLAMA_OPENSSL")) + cmakeCustomOptions.set("LLAMA_OPENSSL", "OFF"); + } } if (buildOptions.platform === "win" && buildOptions.arch === "arm64" && !cmakeCustomOptions.has("GGML_OPENMP")) diff --git a/src/bindings/utils/getLlamaWithoutBackend.ts b/src/bindings/utils/getLlamaWithoutBackend.ts index 992cdf6a..f70a087e 100644 --- a/src/bindings/utils/getLlamaWithoutBackend.ts +++ b/src/bindings/utils/getLlamaWithoutBackend.ts @@ -16,16 +16,28 @@ export async function getLlamaWithoutBackend() { if (sharedLlamaWithoutBackend != null) return sharedLlamaWithoutBackend; - sharedLlamaWithoutBackend = await getLlamaForOptions({ - gpu: false, - progressLogs: false, - logLevel: LlamaLogLevel.error, - build: "never", - usePrebuiltBinaries: true, - vramPadding: 0 - }, { - skipLlamaInit: true - }); + try { + sharedLlamaWithoutBackend = await getLlamaForOptions({ + gpu: false, + progressLogs: false, + logLevel: 
LlamaLogLevel.error, + build: "never", + usePrebuiltBinaries: true, + vramPadding: 0 + }, { + skipLlamaInit: true + }); + } catch (err) { + sharedLlamaWithoutBackend = await getLlamaForOptions({ + progressLogs: false, + logLevel: LlamaLogLevel.error, + build: "never", + usePrebuiltBinaries: true, + vramPadding: 0 + }, { + skipLlamaInit: true + }); + } return sharedLlamaWithoutBackend; }); diff --git a/src/chatWrappers/QwenChatWrapper.ts b/src/chatWrappers/QwenChatWrapper.ts index f9fc9027..edcf5635 100644 --- a/src/chatWrappers/QwenChatWrapper.ts +++ b/src/chatWrappers/QwenChatWrapper.ts @@ -239,7 +239,9 @@ export class QwenChatWrapper extends ChatWrapper { architecture === GgufArchitectureType.qwen2moe || architecture === GgufArchitectureType.qwen2vl || architecture === GgufArchitectureType.qwen3 || - architecture === GgufArchitectureType.qwen3moe + architecture === GgufArchitectureType.qwen3moe || + architecture === GgufArchitectureType.qwen3vl || + architecture === GgufArchitectureType.qwen3vlmoe ); } diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts index 7e19aa96..c3bf548c 100644 --- a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts +++ b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts @@ -21,6 +21,7 @@ import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDes import {documentationPageUrls} from "../../../../config.js"; import {Llama} from "../../../../bindings/Llama.js"; import {toBytes} from "../../../utils/toBytes.js"; +import {padSafeContextSize} from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js"; type InspectMeasureCommand = { modelPath?: string, @@ -952,6 +953,8 @@ function getContextSizesCheckPlan(trainContextSize: number, tests: number = 10, if (size < 2) size = 2; + size = padSafeContextSize(size, "up"); + if (res[res.length - 1] === size) { shouldStop = true; return; diff --git a/src/cli/commands/source/commands/DownloadCommand.ts b/src/cli/commands/source/commands/DownloadCommand.ts index 722a99fc..d4e42bab 100644 --- a/src/cli/commands/source/commands/DownloadCommand.ts +++ b/src/cli/commands/source/commands/DownloadCommand.ts @@ -271,7 +271,13 @@ export async function DownloadLlamaCppCommand(args: DownloadCommandArgs) { console.log(); console.log(); console.log(`${chalk.yellow("Repo:")} ${repo}`); - console.log(`${chalk.yellow("Release:")} ${release}`); + console.log( + chalk.yellow("Release:") + " " + release + ( + release === "latest" + ? 
(" " + chalk.gray("(" + githubReleaseTag + ")")) + : "" + ) + ); console.log(); console.log(chalk.green("Done")); } diff --git a/src/config.ts b/src/config.ts index b68078ae..cded8eb9 100644 --- a/src/config.ts +++ b/src/config.ts @@ -125,3 +125,4 @@ export const documentationPageUrls = { export const newGithubIssueUrl = "https://github.com/withcatai/node-llama-cpp/issues"; export const recommendedBaseDockerImage = "node:20"; export const minAllowedContextSizeInCalculations = 24; +export const contextSizePad = 256; // source: `GGML_PAD` usage in `llama_context::llama_context` in `llama-context.cpp` diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index deeda246..bffc1ad6 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -22,6 +22,7 @@ import { import {resolveBatchItemsPrioritizationStrategy} from "./utils/resolveBatchItemsPrioritizationStrategy.js"; import {LlamaSampler} from "./LlamaSampler.js"; import {TokenPredictor} from "./TokenPredictor.js"; +import {padSafeContextSize} from "./utils/padSafeContextSize.js"; import type {Llama} from "../../bindings/Llama.js"; const defaultLoraScale = 1; @@ -98,12 +99,15 @@ export class LlamaContext { if (_model.disposed) throw new DisposedError(); + const kvUnified = false; this._llama = _model._llama; this._model = _model; this._backendContextDisposeGuard = new DisposeGuard([this._model._backendModelDisposeGuard]); this._modelPreventDisposalHandle = this._model._backendModelDisposeGuard.createPreventDisposalHandle(); this._totalSequences = Math.max(1, Math.floor(sequences)); - this._contextSize = Math.max(2, contextSize); + this._contextSize = kvUnified + ? Math.floor(padSafeContextSize(Math.max(2, contextSize) * this._totalSequences, "up") / this._totalSequences) + : padSafeContextSize(Math.max(2, contextSize), "up"); this._batchSize = Math.max(batchSize, this._totalSequences); this._flashAttention = flashAttention; this._idealThreads = typeof threads === "number" @@ -124,7 +128,7 @@ export class LlamaContext { this._performanceTracking = !!performanceTracking; this._swaFullCache = !!swaFullCache; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ - contextSize: this._contextSize * this._totalSequences, // each sequence needs its own of cells + contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own of cells batchSize: this._batchSize + ( (!this._swaFullCache && this.model.fileInsights.swaSize != null && this.model.fileInsights.swaSize > 0) ? 1 // +1 to handle edge cases with SWA KV cache diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index 52a18bf9..f8a00ae2 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -28,6 +28,11 @@ export type LlamaContextOptions = { * - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attemp to set the context size as high as possible * up to the size the model was trained on, but at least `min` and at most `max`. * + * The actual context size may be slightly larger than your request (by up to 256) due to the implementation in `llama.cpp` that + * aligns the context size to multiples of 256 for performance reasons. + * To check the actual context size that gets created, use the `.contextSize` property + * of the created context instance or any of its sequences. + * * Defaults to `"auto"`. 
*/ contextSize?: "auto" | number | { diff --git a/src/evaluator/LlamaContext/utils/padSafeContextSize.ts b/src/evaluator/LlamaContext/utils/padSafeContextSize.ts new file mode 100644 index 00000000..af9a1626 --- /dev/null +++ b/src/evaluator/LlamaContext/utils/padSafeContextSize.ts @@ -0,0 +1,20 @@ +import {contextSizePad} from "../../../config.js"; + +export function padSafeContextSize(value: number, padDirection: "up" | "down", padding: number = contextSizePad) { + const paddedSize = ggmlPad(value, padding); + + if (paddedSize === value) + return value; + else if (padDirection === "up") + return paddedSize; + else if (padDirection === "down") { + const smallerPaddedSize = ggmlPad(value - padding, padding); + if (smallerPaddedSize >= padding) + return smallerPaddedSize; + } + + return paddedSize; +} +function ggmlPad(value: number, padding: number): number { + return ((value + padding - 1) & ~(padding - 1)); +} diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index 73eb94a6..86dd8ca4 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -5,6 +5,7 @@ import {GgufFileInfo} from "../types/GgufFileInfoTypes.js"; import {GgufTensorInfo} from "../types/GgufTensorInfoTypes.js"; import {GgufArchitectureType} from "../types/GgufMetadataTypes.js"; import {getReadablePath} from "../../cli/utils/getReadablePath.js"; +import {padSafeContextSize} from "../../evaluator/LlamaContext/utils/padSafeContextSize.js"; import {GgufInsightsConfigurationResolver} from "./GgufInsightsConfigurationResolver.js"; import {GgufInsightsTokens} from "./GgufInsightsTokens.js"; @@ -211,6 +212,7 @@ export class GgufInsights { const llmData = this._ggufFileInfo.architectureMetadata; const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? []; const slidingWindow = this.swaSize ?? 0; + const kvUnified = false; const usingSWA = !swaFullCache && slidingWindow > 0 && slidingWindow < contextSize && (this.trainContextSize == null || slidingWindow < this.trainContextSize); const swaPattern = getSwaPatternForArchitecture(this._ggufFileInfo.metadata?.general?.architecture); @@ -219,10 +221,10 @@ export class GgufInsights { : (1 / (swaPattern + (flashAttention ? -0.5 : -1))); // source: `llama_kv_cache_unified::get_padding` in `llama-kv-cache.cpp` - const kvCachePadding = flashAttention - ? 256 - : 32; - const actualContextSize = sequences * contextSize; + const kvCachePadding = 1; + const actualContextSize = kvUnified + ? padSafeContextSize(sequences * contextSize, "up") + : sequences * padSafeContextSize(contextSize, "up"); const kvSize = usingSWA ? ( (1 - nonSwaPercent) * Math.min(actualContextSize, ggmlPad(sequences * slidingWindow + batchSize, kvCachePadding)) + diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index cbae45d5..b0179ae9 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -231,10 +231,12 @@ export class GgufInsightsConfigurationResolver { useMmap }); - let resolvedContextSize = Math.min( - this.ggufInsights.trainContextSize ?? defaultContextSizeForUnfitContextSizeConfiguration, - defaultContextSizeForUnfitContextSizeConfiguration - ); + let resolvedContextSize = forceStrictContextSize + ? contextSize + : Math.min( + this.ggufInsights.trainContextSize ?? 
defaultContextSizeForUnfitContextSizeConfiguration, + defaultContextSizeForUnfitContextSizeConfiguration + ); let contextFitsMemory = false; try { @@ -273,6 +275,13 @@ export class GgufInsightsConfigurationResolver { swaFullCache }); contextFitsMemory = true; + + if (forceStrictContextSize && resolvedContextSize < contextSize) { + contextFitsMemory = false; + resolvedContextSize = contextSize; + } else if (forceStrictContextSize && resolvedContextSize > contextSize) { + resolvedContextSize = contextSize; + } } catch (err) { if (!(err instanceof InsufficientMemoryError)) throw err; diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index 14d08707..ff328737 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -16,6 +16,7 @@ export const enum GgufArchitectureType { nomicBertMoe = "nomic-bert-moe", neoBert = "neo-bert", jinaBertV2 = "jina-bert-v2", + jinaBertV3 = "jina-bert-v3", bloom = "bloom", stablelm = "stablelm", qwen = "qwen", @@ -24,6 +25,8 @@ export const enum GgufArchitectureType { qwen2vl = "qwen2vl", qwen3 = "qwen3", qwen3moe = "qwen3moe", + qwen3vl = "qwen3vl", + qwen3vlmoe = "qwen3vlmoe", phi2 = "phi2", phi3 = "phi3", phimoe = "phimoe", @@ -38,6 +41,7 @@ export const enum GgufArchitectureType { gemma2 = "gemma2", gemma3 = "gemma3", gemma3n = "gemma3n", + gemmaEmbedding = "gemma-embedding", starcoder2 = "starcoder2", mamba = "mamba", mamba2 = "mamba2", @@ -62,6 +66,7 @@ export const enum GgufArchitectureType { t5encoder = "t5encoder", jais = "jais", nemotron = "nemotron", + nemotronH = "nemotron_h", exaone = "exaone", exaone4 = "exaone4", rwkv6 = "rwkv6", @@ -75,6 +80,7 @@ export const enum GgufArchitectureType { wavtokenizerDec = "wavtokenizer-dec", plm = "plm", bailingmoe = "bailingmoe", + bailingmoe2 = "bailingmoe2", dots1 = "dots1", arcee = "arcee", ernie4_5 = "ernie4_5", @@ -84,9 +90,15 @@ export const enum GgufArchitectureType { smollm3 = "smollm3", gptOss = "gpt-oss", lfm2 = "lfm2", + lfm2moe = "lfm2moe", dream = "dream", smallthinker = "smallthinker", llada = "llada", + lladaMoe = "llada-moe", + seedOss = "seed_oss", + grovemoe = "grovemoe", + apertus = "apertus", + cogvlm = "cogvlm", clip = "clip", unknown = "(unknown)" } diff --git a/templates/electron-typescript-react/src/App/App.css b/templates/electron-typescript-react/src/App/App.css index 03503d4b..6c920e19 100644 --- a/templates/electron-typescript-react/src/App/App.css +++ b/templates/electron-typescript-react/src/App/App.css @@ -16,6 +16,7 @@ width: 100%; min-height: 100%; max-width: 1280px; + --app-max-width: 1280px; > .chatHistory { margin-bottom: 32px; diff --git a/templates/electron-typescript-react/src/App/components/Header/Header.css b/templates/electron-typescript-react/src/App/components/Header/Header.css index 8c1c5fe5..9849d2ea 100644 --- a/templates/electron-typescript-react/src/App/components/Header/Header.css +++ b/templates/electron-typescript-react/src/App/components/Header/Header.css @@ -10,6 +10,7 @@ &.main { width: calc(100% - 16px * 2); + max-width: var(--app-max-width, 1280px); position: fixed; z-index: 10; diff --git a/templates/electron-typescript-react/src/App/components/InputRow/InputRow.css b/templates/electron-typescript-react/src/App/components/InputRow/InputRow.css index 582588b7..9453ade4 100644 --- a/templates/electron-typescript-react/src/App/components/InputRow/InputRow.css +++ b/templates/electron-typescript-react/src/App/components/InputRow/InputRow.css @@ -12,6 +12,7 @@ &.main { width: calc(100% - 16px * 
2); + max-width: var(--app-max-width, 1280px); position: fixed; background-color: var(--panel-background-color); border-radius: 12px; diff --git a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts index 3ccf0673..2e6067ee 100644 --- a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts +++ b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts @@ -114,7 +114,7 @@ describe("functionary", () => { freeRam: s1GB * 6 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7717"); + expect(res.contextSize).to.toMatchInlineSnapshot("7680"); } { const res = await resolveGpuLayers(0, { @@ -151,7 +151,7 @@ describe("functionary", () => { freeSwap: s1GB * 1 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } { const res = await resolveGpuLayers(0, { @@ -255,7 +255,7 @@ describe("functionary", () => { freeRam: s1GB * 4.5 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("4011"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); } try { await resolveGpuLayers(16, { @@ -318,7 +318,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7717"); + expect(res.contextSize).to.toMatchInlineSnapshot("7680"); } }); @@ -343,7 +343,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 7.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("1757"); + expect(res.contextSize).to.toMatchInlineSnapshot("1536"); } { const res = await resolveGpuLayers(16, { @@ -354,7 +354,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 5.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("5502"); + expect(res.contextSize).to.toMatchInlineSnapshot("5376"); } try { await resolveGpuLayers(16, { @@ -409,7 +409,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("4441"); + expect(res.contextSize).to.toMatchInlineSnapshot("4352"); } { const res = await resolveGpuLayers(16, { @@ -422,7 +422,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } }); @@ -608,7 +608,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } { const res = await resolveGpuLayers(32, { @@ -619,7 +619,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1164"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); } { const res = await resolveGpuLayers(32, { @@ -761,7 +761,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1164"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); } { const res = await resolveGpuLayers(33, { @@ -772,7 +772,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - 
expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } { const res = await resolveGpuLayers(33, { @@ -783,7 +783,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7717"); + expect(res.contextSize).to.toMatchInlineSnapshot("7680"); } { const res = await resolveGpuLayers(33, { @@ -795,7 +795,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7717"); + expect(res.contextSize).to.toMatchInlineSnapshot("7680"); } }); @@ -809,7 +809,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("6248"); + expect(res.contextSize).to.toMatchInlineSnapshot("6144"); } { const res = await resolveGpuLayers(33, { @@ -820,7 +820,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("2972"); + expect(res.contextSize).to.toMatchInlineSnapshot("2816"); } { const res = await resolveGpuLayers(33, { @@ -831,7 +831,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1333"); + expect(res.contextSize).to.toMatchInlineSnapshot("1280"); } try { await resolveGpuLayers(33, { @@ -908,7 +908,7 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("471"); + expect(res.contextSize).to.toMatchInlineSnapshot("458"); } { const res = await resolveGpuLayers("max", { @@ -918,7 +918,7 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("895"); + expect(res.contextSize).to.toMatchInlineSnapshot("768"); } }); @@ -962,7 +962,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); - expect(res.contextSize).to.toMatchInlineSnapshot("7471"); + expect(res.contextSize).to.toMatchInlineSnapshot("7424"); } { const res = await resolveGpuLayers("auto", { @@ -1062,7 +1062,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4718"); + expect(res.contextSize).to.toMatchInlineSnapshot("4608"); } { const res = await resolveGpuLayers("auto", { @@ -1072,7 +1072,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7995"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); } { const res = await resolveGpuLayers("auto", { @@ -1095,7 +1095,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } { const res = await resolveGpuLayers("auto", { @@ -1105,7 +1105,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } { const res = await resolveGpuLayers("auto", { @@ -1115,7 +1115,7 @@ describe("functionary", () => { 
freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("5438"); + expect(res.contextSize).to.toMatchInlineSnapshot("5376"); } { const res = await resolveGpuLayers("auto", { @@ -1125,7 +1125,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); - expect(res.contextSize).to.toMatchInlineSnapshot("7471"); + expect(res.contextSize).to.toMatchInlineSnapshot("7424"); } { const res = await resolveGpuLayers("auto", { @@ -1225,7 +1225,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4718"); + expect(res.contextSize).to.toMatchInlineSnapshot("4608"); } { const res = await resolveGpuLayers("auto", { @@ -1235,7 +1235,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7995"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); } { const res = await resolveGpuLayers("auto", { @@ -1349,7 +1349,7 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("4011"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); } }); @@ -1362,7 +1362,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } { const res = await resolveGpuLayers({min: 0, max: 4}, { @@ -1372,7 +1372,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } try { await resolveGpuLayers({min: 2}, { @@ -1451,7 +1451,7 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("4011"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); } }); }); @@ -1480,7 +1480,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("6531"); + expect(res.contextSize).to.toMatchInlineSnapshot("6400"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1492,7 +1492,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); - expect(res.contextSize).to.toMatchInlineSnapshot("7471"); + expect(res.contextSize).to.toMatchInlineSnapshot("7424"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1569,7 +1569,7 @@ describe("functionary", () => { freeRam: s1GB * 7 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("6531"); + expect(res.contextSize).to.toMatchInlineSnapshot("6400"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1581,7 +1581,7 @@ describe("functionary", () => { freeRam: s1GB * 7 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); - expect(res.contextSize).to.toMatchInlineSnapshot("7471"); + expect(res.contextSize).to.toMatchInlineSnapshot("7424"); expect(res.contextSize).to.be.gte(contextSize); } { diff --git 
a/test/modelDependent/llama3.2/sequenceState.test.ts b/test/modelDependent/llama3.2/sequenceState.test.ts index eec1916d..72e6a94f 100644 --- a/test/modelDependent/llama3.2/sequenceState.test.ts +++ b/test/modelDependent/llama3.2/sequenceState.test.ts @@ -195,13 +195,14 @@ describe("llama 3.2", () => { }); const contextSequence1 = context1.getSequence(); const contextSequence2 = context2.getSequence(); + expect(context2.contextSize).to.eql(256); // the context is actually bigger due to `llama.cpp`'s padding const chatSession1 = new LlamaChatSession({ contextSequence: contextSequence1 }); - const res1 = await chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); + const res1 = await chatSession1.prompt("Remember: locks are not doors. Also, write a long poem about it", {maxTokens: 154}); + expect(res1).toMatch(/^(A clever reminder indeed.|A wise phrase to ponder)/); const stateFile1Path = await getTempTestFilePath("state1"); @@ -211,12 +212,12 @@ describe("llama 3.2", () => { const contextSequence1TokensState = contextSequence1.tokenMeter.getState(); expect(contextSequence1.contextTokens).to.eql(state1Tokens); - expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("103"); - expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"11.27MB"'); + expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("262"); + expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"28.66MB"'); expect(contextSequence1TokensState).to.toMatchInlineSnapshot(` { - "usedInputTokens": 99, - "usedOutputTokens": 4, + "usedInputTokens": 108, + "usedOutputTokens": 154, } `); diff --git a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts index 39a722b0..4e013171 100644 --- a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts +++ b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts @@ -111,7 +111,7 @@ describe("stableCode", () => { freeVram: s1GB * 3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("8061"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); } try { await resolveGpuLayers(16, { @@ -142,7 +142,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("136"); + expect(res.contextSize).to.toMatchInlineSnapshot("36"); } @@ -174,7 +174,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("11347"); + expect(res.contextSize).to.toMatchInlineSnapshot("11264"); } try { await resolveGpuLayers(32, { @@ -192,7 +192,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("47"); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); } { @@ -223,12 +223,12 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("11347"); + expect(res.contextSize).to.toMatchInlineSnapshot("11264"); } try { await resolveGpuLayers(33, { totalVram: s1GB * 6, - freeVram: s1GB * 0.2 + freeVram: s1GB * 0.4 }); expect.unreachable("Should have thrown an error"); } catch (err) { @@ -241,7 +241,7 @@ describe("stableCode", () => { 
ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("47"); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); } { @@ -303,7 +303,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("47"); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); } { const res = await resolveGpuLayers("max", { @@ -311,7 +311,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("5886"); + expect(res.contextSize).to.toMatchInlineSnapshot("5632"); } { const res = await resolveGpuLayers("max", { @@ -319,7 +319,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("6978"); + expect(res.contextSize).to.toMatchInlineSnapshot("6912"); } { const res = await resolveGpuLayers("max", { @@ -327,7 +327,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("8070"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); } }); @@ -346,7 +346,7 @@ describe("stableCode", () => { freeVram: s1GB * 0.4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); - expect(res.contextSize).to.toMatchInlineSnapshot("10841"); + expect(res.contextSize).to.toMatchInlineSnapshot("10752"); } { const res = await resolveGpuLayers("auto", { @@ -362,7 +362,7 @@ describe("stableCode", () => { freeVram: s1GB * 1.4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("5"); - expect(res.contextSize).to.toMatchInlineSnapshot("8361"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -370,7 +370,7 @@ describe("stableCode", () => { freeVram: s1GB * 2.4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("1517"); + expect(res.contextSize).to.toMatchInlineSnapshot("1280"); } { const res = await resolveGpuLayers("auto", { @@ -378,7 +378,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3428"); + expect(res.contextSize).to.toMatchInlineSnapshot("3328"); } { const res = await resolveGpuLayers("auto", { @@ -386,7 +386,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3974"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); } { const res = await resolveGpuLayers("auto", { @@ -394,7 +394,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4520"); + expect(res.contextSize).to.toMatchInlineSnapshot("4352"); } { const res = await resolveGpuLayers("auto", { @@ -402,7 +402,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5340"); + expect(res.contextSize).to.toMatchInlineSnapshot("5120"); } { const res = await resolveGpuLayers("auto", { @@ -410,7 +410,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5886"); + 
expect(res.contextSize).to.toMatchInlineSnapshot("5632"); } { const res = await resolveGpuLayers("auto", { @@ -418,7 +418,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("6705"); + expect(res.contextSize).to.toMatchInlineSnapshot("6656"); } { const res = await resolveGpuLayers("auto", { @@ -426,7 +426,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7251"); + expect(res.contextSize).to.toMatchInlineSnapshot("7168"); } { const res = await resolveGpuLayers("auto", { @@ -434,7 +434,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("8070"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); } { const res = await resolveGpuLayers("auto", { @@ -442,7 +442,7 @@ describe("stableCode", () => { freeVram: s1GB * 5.2 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("9163"); + expect(res.contextSize).to.toMatchInlineSnapshot("8960"); } { const res = await resolveGpuLayers("auto", { @@ -450,7 +450,7 @@ describe("stableCode", () => { freeVram: s1GB * 5.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("10801"); + expect(res.contextSize).to.toMatchInlineSnapshot("10752"); } { const res = await resolveGpuLayers("auto", { @@ -458,7 +458,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("11347"); + expect(res.contextSize).to.toMatchInlineSnapshot("11264"); } }); @@ -504,7 +504,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("13252"); + expect(res.contextSize).to.toMatchInlineSnapshot("13056"); } try { await resolveGpuLayers({min: 16}, { @@ -522,7 +522,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5886"); + expect(res.contextSize).to.toMatchInlineSnapshot("5632"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -532,7 +532,7 @@ describe("stableCode", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); - expect(res.contextSize).to.toMatchInlineSnapshot("8248"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -542,7 +542,7 @@ describe("stableCode", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("8061"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); } }); @@ -565,7 +565,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5886"); + expect(res.contextSize).to.toMatchInlineSnapshot("5632"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -575,7 +575,7 @@ describe("stableCode", () => { freeVram: s1GB * 1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("3"); - 
expect(res.contextSize).to.toMatchInlineSnapshot("5921"); + expect(res.contextSize).to.toMatchInlineSnapshot("5888"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -585,7 +585,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("9206"); + expect(res.contextSize).to.toMatchInlineSnapshot("8960"); expect(res.contextSize).to.be.gte(contextSize); } {
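Note on the context-size padding introduced in this patch: `llama.cpp` aligns the KV cache size to multiples of 256 (see the `contextSizePad` constant and the `GGML_PAD` reference added in `src/config.ts`), and the new `padSafeContextSize` helper applies the same alignment on the JavaScript side before memory estimation. The standalone sketch below mirrors that helper so the arithmetic behind the updated inline snapshots is easy to check by hand; the example values at the end are illustrative only and are not taken from the test fixtures above.

```ts
// Mirror of src/evaluator/LlamaContext/utils/padSafeContextSize.ts from this patch,
// shown standalone. The `GGML_PAD`-style bit trick only works when `padding` is a power of two.
const contextSizePad = 256;

function ggmlPad(value: number, padding: number): number {
    // round `value` up to the nearest multiple of `padding`
    return (value + padding - 1) & ~(padding - 1);
}

function padSafeContextSize(
    value: number,
    padDirection: "up" | "down",
    padding: number = contextSizePad
): number {
    const paddedSize = ggmlPad(value, padding);

    if (paddedSize === value)
        return value; // already aligned
    else if (padDirection === "up")
        return paddedSize;
    else if (padDirection === "down") {
        // round down instead, but never below one full padding step
        const smallerPaddedSize = ggmlPad(value - padding, padding);
        if (smallerPaddedSize >= padding)
            return smallerPaddedSize;
    }

    return paddedSize;
}

// Illustrative values (not from the test fixtures):
console.log(padSafeContextSize(2256, "up"));   // 2304
console.log(padSafeContextSize(2256, "down")); // 2048
console.log(padSafeContextSize(2048, "down")); // 2048 (already a multiple of 256)
```

Aligning the requested size before estimating memory keeps `GgufInsights` consistent with the size `llama.cpp` will actually allocate, which appears to be why most of the resolver snapshots updated above now land on 256-aligned values.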