diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 555e62a9..30aa2d9e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -70,10 +70,10 @@ jobs: os: ubuntu-22.04 artifact: "linux-2" - name: "macOS x64" - os: macos-13 + os: macos-15-intel artifact: "mac-x64" - name: "macOS arm64" - os: macos-14 + os: macos-26 artifact: "mac-arm64" steps: @@ -194,10 +194,11 @@ jobs: sudo apt install vulkan-sdk - name: Install dependencies on macOS - if: matrix.config.name == 'macOS' + if: matrix.config.name == 'macOS x64' || matrix.config.name == 'macOS arm64' run: | brew install cmake ninja alias make=cmake + cmake --version - name: Setup & Build id: build @@ -461,7 +462,7 @@ jobs: model-dependent-tests: name: Model dependent tests - runs-on: macos-13 + runs-on: macos-26 env: NODE_LLAMA_CPP_GPU: false needs: @@ -490,7 +491,6 @@ jobs: # sudo apt-get install ninja-build cmake - name: Install dependencies on macOS - if: matrix.config.name == 'macOS' run: | brew install cmake ninja alias make=cmake @@ -602,10 +602,8 @@ jobs: - name: Release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - NPM_TOKEN: ${{ secrets.NPM_TOKEN }} GH_RELEASE_REF: ${{ github.ref }} run: | - echo "//registry.npmjs.org/:_authToken=\${NPM_TOKEN}" > ~/.npmrc export DRY_RUN_RESULT_FILE_PATH="$(pwd)/semanticReleaseDryRunReleaseResult.json" git apply --ignore-whitespace ./scripts/patches/@semantic-release+github+11.0.0.patch @@ -638,13 +636,10 @@ jobs: - name: Release `create-node-llama-cpp` module if: steps.set-npm-url.outputs.npm-url != '' env: - NPM_TOKEN: ${{ secrets.NPM_TOKEN }} GH_RELEASE_REF: ${{ github.ref }} run: | cd packages/create-node-llama-cpp - echo "//registry.npmjs.org/:_authToken=\${NPM_TOKEN}" > ~/.npmrc - if [ "$GH_RELEASE_REF" == "refs/heads/beta" ]; then npm publish --tag beta else @@ -682,7 +677,7 @@ jobs: - name: "Ubuntu" os: ubuntu-22.04 - name: "macOS" - os: macos-13 + os: macos-15-intel steps: - uses: actions/checkout@v4 diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index d1b1052b..36828590 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -2,6 +2,7 @@ #include #include #include "common/common.h" +#include "llama-vocab.h" #include "llama.h" #include "addonGlobals.h" @@ -345,8 +346,14 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker { } } - sampler->acceptToken(new_token_id); - result = new_token_id; + try { + sampler->acceptToken(new_token_id); + result = new_token_id; + } catch (const std::exception& e) { + SetError(std::string("Failed to accept token in sampler: ") + e.what()); + } catch(...) { + SetError("Unknown error when calling \"acceptToken\""); + } } void OnOK() { Napi::Number resultToken; diff --git a/llama/addon/AddonGrammar.cpp b/llama/addon/AddonGrammar.cpp index 3f061f54..92247437 100644 --- a/llama/addon/AddonGrammar.cpp +++ b/llama/addon/AddonGrammar.cpp @@ -49,7 +49,15 @@ Napi::Value AddonGrammar::isTextCompatible(const Napi::CallbackInfo& info) { llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(parsed_grammar); for (const auto & cpt : cpts) { - llama_grammar_accept(parsed_grammar, cpt); + try { + llama_grammar_accept(parsed_grammar, cpt); + } catch (const std::exception & e) { + llama_grammar_free_impl(parsed_grammar); + return Napi::Boolean::New(info.Env(), false); + } catch (...) 
{ + llama_grammar_free_impl(parsed_grammar); + return Napi::Boolean::New(info.Env(), false); + } if (stacks_cur.empty()) { // no stacks means that the grammar failed to match at this point diff --git a/llama/addon/AddonSampler.cpp b/llama/addon/AddonSampler.cpp index 12c9a1ab..fc58de6f 100644 --- a/llama/addon/AddonSampler.cpp +++ b/llama/addon/AddonSampler.cpp @@ -1,5 +1,7 @@ #include #include "common/common.h" +#include "globals/addonLog.h" +#include "ggml.h" #include "llama.h" #include "AddonGrammarEvaluationState.h" @@ -449,7 +451,15 @@ Napi::Value AddonSampler::AcceptGrammarEvaluationStateToken(const Napi::Callback llama_token tokenId = info[1].As().Int32Value(); if ((grammar_evaluation_state)->sampler != nullptr) { - llama_sampler_accept((grammar_evaluation_state)->sampler, tokenId); + try { + llama_sampler_accept((grammar_evaluation_state)->sampler, tokenId); + } catch (const std::exception & e) { + Napi::Error::New(info.Env(), std::string("Failed to accept token in grammar sampler: ") + e.what()).ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } catch (...) { + Napi::Error::New(info.Env(), "Failed to accept token in grammar sampler").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } } return info.Env().Undefined(); @@ -465,7 +475,14 @@ Napi::Value AddonSampler::CanBeNextTokenForGrammarEvaluationState(const Napi::Ca candidates.emplace_back(llama_token_data { tokenId, 1, 0.0f }); llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - llama_sampler_apply((grammar_evaluation_state)->sampler, &candidates_p); + try { + llama_sampler_apply((grammar_evaluation_state)->sampler, &candidates_p); + } catch (const std::exception & e) { + addonLog(GGML_LOG_LEVEL_DEBUG, std::string("Failed to apply grammar sampler: ") + e.what()); + return Napi::Boolean::New(info.Env(), false); + } catch (...) 
{ + return Napi::Boolean::New(info.Env(), false); + } if (candidates_p.size == 0 || candidates_p.data[0].logit == -INFINITY) { return Napi::Boolean::New(info.Env(), false); diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp index a01a987e..66e45a12 100644 --- a/llama/addon/addon.cpp +++ b/llama/addon/addon.cpp @@ -11,6 +11,8 @@ #include "globals/getSwapInfo.h" #include "globals/getMemoryInfo.h" +#include + bool backendInitialized = false; bool backendDisposed = false; @@ -226,6 +228,11 @@ Napi::Value addonSetNuma(const Napi::CallbackInfo& info) { return info.Env().Undefined(); } +Napi::Value markLoaded(const Napi::CallbackInfo& info) { + static std::atomic_bool loaded = false; + return Napi::Boolean::New(info.Env(), loaded.exchange(true)); +} + Napi::Value addonInit(const Napi::CallbackInfo& info) { if (backendInitialized) { Napi::Promise::Deferred deferred = Napi::Promise::Deferred::New(info.Env()); @@ -266,6 +273,7 @@ static void addonFreeLlamaBackend(Napi::Env env, int* data) { Napi::Object registerCallback(Napi::Env env, Napi::Object exports) { exports.DefineProperties({ + Napi::PropertyDescriptor::Function("markLoaded", markLoaded), Napi::PropertyDescriptor::Function("systemInfo", systemInfo), Napi::PropertyDescriptor::Function("getSupportsGpuOffloading", addonGetSupportsGpuOffloading), Napi::PropertyDescriptor::Function("getSupportsMmap", addonGetSupportsMmap), diff --git a/llama/addon/globals/addonLog.cpp b/llama/addon/globals/addonLog.cpp index ae52ae30..c80820a5 100644 --- a/llama/addon/globals/addonLog.cpp +++ b/llama/addon/globals/addonLog.cpp @@ -137,3 +137,7 @@ Napi::Value setLoggerLogLevel(const Napi::CallbackInfo& info) { return info.Env().Undefined(); } + +void addonLog(ggml_log_level level, const std::string text) { + addonLlamaCppLogCallback(level, std::string("[addon] " + text + "\n").c_str(), nullptr); +} diff --git a/llama/addon/globals/addonLog.h b/llama/addon/globals/addonLog.h index cc15681e..44c445cd 100644 --- a/llama/addon/globals/addonLog.h +++ b/llama/addon/globals/addonLog.h @@ -20,3 +20,5 @@ using AddonThreadSafeLogCallbackFunction = Napi::Value setLogger(const Napi::CallbackInfo& info); Napi::Value setLoggerLogLevel(const Napi::CallbackInfo& info); + +void addonLog(ggml_log_level level, const std::string text); diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index a1cbefc3..122c87cf 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -48,6 +48,7 @@ export type BindingModule = { acceptGrammarEvaluationStateToken(grammarEvaluationState: AddonGrammarEvaluationState, token: Token): void, canBeNextTokenForGrammarEvaluationState(grammarEvaluationState: AddonGrammarEvaluationState, token: Token): boolean }, + markLoaded(): boolean, systemInfo(): string, getSupportsGpuOffloading(): boolean, getSupportsMmap(): boolean, diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts index e65060a5..d5725e96 100644 --- a/src/bindings/Llama.ts +++ b/src/bindings/Llama.ts @@ -73,7 +73,7 @@ export class Llama { private constructor({ bindings, bindingPath, extBackendsPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, numa, buildGpu, - maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator + maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator, skipLlamaInit }: { bindings: BindingModule, bindingPath: string, @@ -94,7 +94,8 @@ export class Llama { vramPadding: MemoryReservation, ramOrchestrator: MemoryOrchestrator, ramPadding: 
MemoryReservation, - swapOrchestrator: MemoryOrchestrator + swapOrchestrator: MemoryOrchestrator, + skipLlamaInit: boolean }) { this._dispatchPendingLogMicrotask = this._dispatchPendingLogMicrotask.bind(this); this._onAddonLog = this._onAddonLog.bind(this); @@ -106,7 +107,9 @@ export class Llama { ? LlamaLogLevel.debug : (logLevel ?? LlamaLogLevel.debug); - if (!this._debug) { + const previouslyLoaded = bindings.markLoaded(); + + if (!this._debug && (!skipLlamaInit || !previouslyLoaded)) { this._bindings.setLogger(this._onAddonLog); this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(this._logLevel) ?? defaultLogLevel); } @@ -576,7 +579,8 @@ export class Llama { vramPadding: vramOrchestrator.reserveMemory(0), ramOrchestrator, ramPadding: resolvedRamPadding, - swapOrchestrator + swapOrchestrator, + skipLlamaInit }); if (llama.gpu === false || vramPadding === 0) { @@ -688,6 +692,8 @@ function getTransformedLogLevel(level: LlamaLogLevel, message: string, gpu: Buil return LlamaLogLevel.info; else if (gpu === false && level === LlamaLogLevel.warn && message.startsWith("llama_adapter_lora_init_impl: lora for '") && message.endsWith("' cannot use buft 'CPU_REPACK', fallback to CPU")) return LlamaLogLevel.info; + else if (gpu === "metal" && level === LlamaLogLevel.warn && message.startsWith("ggml_metal_device_init: tensor API disabled for")) + return LlamaLogLevel.info; return level; } diff --git a/src/bindings/utils/compileLLamaCpp.ts b/src/bindings/utils/compileLLamaCpp.ts index fe56d3e4..51e5cdf9 100644 --- a/src/bindings/utils/compileLLamaCpp.ts +++ b/src/bindings/utils/compileLLamaCpp.ts @@ -131,12 +131,20 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions if (!cmakeCustomOptions.has("GGML_CCACHE")) cmakeCustomOptions.set("GGML_CCACHE", "OFF"); - if (!cmakeCustomOptions.has("LLAMA_CURL") || isCmakeValueOff(cmakeCustomOptions.get("LLAMA_CURL"))) { - cmakeCustomOptions.set("LLAMA_CURL", "OFF"); + // avoid linking to extra libraries that we don't use + { + if (!cmakeCustomOptions.has("LLAMA_CURL") || isCmakeValueOff(cmakeCustomOptions.get("LLAMA_CURL"))) + cmakeCustomOptions.set("LLAMA_CURL", "OFF"); - // avoid linking to extra libraries that we don't use - if (!cmakeCustomOptions.has("LLAMA_OPENSSL")) - cmakeCustomOptions.set("LLAMA_OPENSSL", "OFF"); + if (!cmakeCustomOptions.has("LLAMA_HTTPLIB") || isCmakeValueOff(cmakeCustomOptions.get("LLAMA_HTTPLIB"))) { + cmakeCustomOptions.set("LLAMA_HTTPLIB", "OFF"); + + if (!cmakeCustomOptions.has("LLAMA_BUILD_BORINGSSL")) + cmakeCustomOptions.set("LLAMA_BUILD_BORINGSSL", "OFF"); + + if (!cmakeCustomOptions.has("LLAMA_OPENSSL")) + cmakeCustomOptions.set("LLAMA_OPENSSL", "OFF"); + } } if (buildOptions.platform === "win" && buildOptions.arch === "arm64" && !cmakeCustomOptions.has("GGML_OPENMP")) diff --git a/src/bindings/utils/getLlamaWithoutBackend.ts b/src/bindings/utils/getLlamaWithoutBackend.ts index 992cdf6a..f70a087e 100644 --- a/src/bindings/utils/getLlamaWithoutBackend.ts +++ b/src/bindings/utils/getLlamaWithoutBackend.ts @@ -16,16 +16,28 @@ export async function getLlamaWithoutBackend() { if (sharedLlamaWithoutBackend != null) return sharedLlamaWithoutBackend; - sharedLlamaWithoutBackend = await getLlamaForOptions({ - gpu: false, - progressLogs: false, - logLevel: LlamaLogLevel.error, - build: "never", - usePrebuiltBinaries: true, - vramPadding: 0 - }, { - skipLlamaInit: true - }); + try { + sharedLlamaWithoutBackend = await getLlamaForOptions({ + gpu: false, + progressLogs: false, + logLevel: 
LlamaLogLevel.error, + build: "never", + usePrebuiltBinaries: true, + vramPadding: 0 + }, { + skipLlamaInit: true + }); + } catch (err) { + sharedLlamaWithoutBackend = await getLlamaForOptions({ + progressLogs: false, + logLevel: LlamaLogLevel.error, + build: "never", + usePrebuiltBinaries: true, + vramPadding: 0 + }, { + skipLlamaInit: true + }); + } return sharedLlamaWithoutBackend; }); diff --git a/src/chatWrappers/QwenChatWrapper.ts b/src/chatWrappers/QwenChatWrapper.ts index f9fc9027..edcf5635 100644 --- a/src/chatWrappers/QwenChatWrapper.ts +++ b/src/chatWrappers/QwenChatWrapper.ts @@ -239,7 +239,9 @@ export class QwenChatWrapper extends ChatWrapper { architecture === GgufArchitectureType.qwen2moe || architecture === GgufArchitectureType.qwen2vl || architecture === GgufArchitectureType.qwen3 || - architecture === GgufArchitectureType.qwen3moe + architecture === GgufArchitectureType.qwen3moe || + architecture === GgufArchitectureType.qwen3vl || + architecture === GgufArchitectureType.qwen3vlmoe ); } diff --git a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts index 7e19aa96..c3bf548c 100644 --- a/src/cli/commands/inspect/commands/InspectMeasureCommand.ts +++ b/src/cli/commands/inspect/commands/InspectMeasureCommand.ts @@ -21,6 +21,7 @@ import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDes import {documentationPageUrls} from "../../../../config.js"; import {Llama} from "../../../../bindings/Llama.js"; import {toBytes} from "../../../utils/toBytes.js"; +import {padSafeContextSize} from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js"; type InspectMeasureCommand = { modelPath?: string, @@ -952,6 +953,8 @@ function getContextSizesCheckPlan(trainContextSize: number, tests: number = 10, if (size < 2) size = 2; + size = padSafeContextSize(size, "up"); + if (res[res.length - 1] === size) { shouldStop = true; return; diff --git a/src/cli/commands/source/commands/DownloadCommand.ts b/src/cli/commands/source/commands/DownloadCommand.ts index 722a99fc..d4e42bab 100644 --- a/src/cli/commands/source/commands/DownloadCommand.ts +++ b/src/cli/commands/source/commands/DownloadCommand.ts @@ -271,7 +271,13 @@ export async function DownloadLlamaCppCommand(args: DownloadCommandArgs) { console.log(); console.log(); console.log(`${chalk.yellow("Repo:")} ${repo}`); - console.log(`${chalk.yellow("Release:")} ${release}`); + console.log( + chalk.yellow("Release:") + " " + release + ( + release === "latest" + ? 
(" " + chalk.gray("(" + githubReleaseTag + ")")) + : "" + ) + ); console.log(); console.log(chalk.green("Done")); } diff --git a/src/config.ts b/src/config.ts index b68078ae..cded8eb9 100644 --- a/src/config.ts +++ b/src/config.ts @@ -125,3 +125,4 @@ export const documentationPageUrls = { export const newGithubIssueUrl = "https://github.com/withcatai/node-llama-cpp/issues"; export const recommendedBaseDockerImage = "node:20"; export const minAllowedContextSizeInCalculations = 24; +export const contextSizePad = 256; // source: `GGML_PAD` usage in `llama_context::llama_context` in `llama-context.cpp` diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index deeda246..bffc1ad6 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -22,6 +22,7 @@ import { import {resolveBatchItemsPrioritizationStrategy} from "./utils/resolveBatchItemsPrioritizationStrategy.js"; import {LlamaSampler} from "./LlamaSampler.js"; import {TokenPredictor} from "./TokenPredictor.js"; +import {padSafeContextSize} from "./utils/padSafeContextSize.js"; import type {Llama} from "../../bindings/Llama.js"; const defaultLoraScale = 1; @@ -98,12 +99,15 @@ export class LlamaContext { if (_model.disposed) throw new DisposedError(); + const kvUnified = false; this._llama = _model._llama; this._model = _model; this._backendContextDisposeGuard = new DisposeGuard([this._model._backendModelDisposeGuard]); this._modelPreventDisposalHandle = this._model._backendModelDisposeGuard.createPreventDisposalHandle(); this._totalSequences = Math.max(1, Math.floor(sequences)); - this._contextSize = Math.max(2, contextSize); + this._contextSize = kvUnified + ? Math.floor(padSafeContextSize(Math.max(2, contextSize) * this._totalSequences, "up") / this._totalSequences) + : padSafeContextSize(Math.max(2, contextSize), "up"); this._batchSize = Math.max(batchSize, this._totalSequences); this._flashAttention = flashAttention; this._idealThreads = typeof threads === "number" @@ -124,7 +128,7 @@ export class LlamaContext { this._performanceTracking = !!performanceTracking; this._swaFullCache = !!swaFullCache; this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({ - contextSize: this._contextSize * this._totalSequences, // each sequence needs its own of cells + contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own of cells batchSize: this._batchSize + ( (!this._swaFullCache && this.model.fileInsights.swaSize != null && this.model.fileInsights.swaSize > 0) ? 1 // +1 to handle edge cases with SWA KV cache diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index 52a18bf9..f8a00ae2 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -28,6 +28,11 @@ export type LlamaContextOptions = { * - **`{min?: number, max?: number}`** - adapt to the current VRAM state and attemp to set the context size as high as possible * up to the size the model was trained on, but at least `min` and at most `max`. * + * The actual context size may be slightly larger than your request (by up to 256) due to the implementation in `llama.cpp` that + * aligns the context size to multiples of 256 for performance reasons. + * To check the actual context size that gets created, use the `.contextSize` property + * of the created context instance or any of its sequences. + * * Defaults to `"auto"`. 
*/ contextSize?: "auto" | number | { diff --git a/src/evaluator/LlamaContext/utils/padSafeContextSize.ts b/src/evaluator/LlamaContext/utils/padSafeContextSize.ts new file mode 100644 index 00000000..af9a1626 --- /dev/null +++ b/src/evaluator/LlamaContext/utils/padSafeContextSize.ts @@ -0,0 +1,20 @@ +import {contextSizePad} from "../../../config.js"; + +export function padSafeContextSize(value: number, padDirection: "up" | "down", padding: number = contextSizePad) { + const paddedSize = ggmlPad(value, padding); + + if (paddedSize === value) + return value; + else if (padDirection === "up") + return paddedSize; + else if (padDirection === "down") { + const smallerPaddedSize = ggmlPad(value - padding, padding); + if (smallerPaddedSize >= padding) + return smallerPaddedSize; + } + + return paddedSize; +} +function ggmlPad(value: number, padding: number): number { + return ((value + padding - 1) & ~(padding - 1)); +} diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index 73eb94a6..86dd8ca4 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -5,6 +5,7 @@ import {GgufFileInfo} from "../types/GgufFileInfoTypes.js"; import {GgufTensorInfo} from "../types/GgufTensorInfoTypes.js"; import {GgufArchitectureType} from "../types/GgufMetadataTypes.js"; import {getReadablePath} from "../../cli/utils/getReadablePath.js"; +import {padSafeContextSize} from "../../evaluator/LlamaContext/utils/padSafeContextSize.js"; import {GgufInsightsConfigurationResolver} from "./GgufInsightsConfigurationResolver.js"; import {GgufInsightsTokens} from "./GgufInsightsTokens.js"; @@ -211,6 +212,7 @@ export class GgufInsights { const llmData = this._ggufFileInfo.architectureMetadata; const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? []; const slidingWindow = this.swaSize ?? 0; + const kvUnified = false; const usingSWA = !swaFullCache && slidingWindow > 0 && slidingWindow < contextSize && (this.trainContextSize == null || slidingWindow < this.trainContextSize); const swaPattern = getSwaPatternForArchitecture(this._ggufFileInfo.metadata?.general?.architecture); @@ -219,10 +221,10 @@ export class GgufInsights { : (1 / (swaPattern + (flashAttention ? -0.5 : -1))); // source: `llama_kv_cache_unified::get_padding` in `llama-kv-cache.cpp` - const kvCachePadding = flashAttention - ? 256 - : 32; - const actualContextSize = sequences * contextSize; + const kvCachePadding = 1; + const actualContextSize = kvUnified + ? padSafeContextSize(sequences * contextSize, "up") + : sequences * padSafeContextSize(contextSize, "up"); const kvSize = usingSWA ? ( (1 - nonSwaPercent) * Math.min(actualContextSize, ggmlPad(sequences * slidingWindow + batchSize, kvCachePadding)) + diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index cbae45d5..b0179ae9 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -231,10 +231,12 @@ export class GgufInsightsConfigurationResolver { useMmap }); - let resolvedContextSize = Math.min( - this.ggufInsights.trainContextSize ?? defaultContextSizeForUnfitContextSizeConfiguration, - defaultContextSizeForUnfitContextSizeConfiguration - ); + let resolvedContextSize = forceStrictContextSize + ? contextSize + : Math.min( + this.ggufInsights.trainContextSize ?? 
defaultContextSizeForUnfitContextSizeConfiguration, + defaultContextSizeForUnfitContextSizeConfiguration + ); let contextFitsMemory = false; try { @@ -273,6 +275,13 @@ export class GgufInsightsConfigurationResolver { swaFullCache }); contextFitsMemory = true; + + if (forceStrictContextSize && resolvedContextSize < contextSize) { + contextFitsMemory = false; + resolvedContextSize = contextSize; + } else if (forceStrictContextSize && resolvedContextSize > contextSize) { + resolvedContextSize = contextSize; + } } catch (err) { if (!(err instanceof InsufficientMemoryError)) throw err; diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index 14d08707..ff328737 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -16,6 +16,7 @@ export const enum GgufArchitectureType { nomicBertMoe = "nomic-bert-moe", neoBert = "neo-bert", jinaBertV2 = "jina-bert-v2", + jinaBertV3 = "jina-bert-v3", bloom = "bloom", stablelm = "stablelm", qwen = "qwen", @@ -24,6 +25,8 @@ export const enum GgufArchitectureType { qwen2vl = "qwen2vl", qwen3 = "qwen3", qwen3moe = "qwen3moe", + qwen3vl = "qwen3vl", + qwen3vlmoe = "qwen3vlmoe", phi2 = "phi2", phi3 = "phi3", phimoe = "phimoe", @@ -38,6 +41,7 @@ export const enum GgufArchitectureType { gemma2 = "gemma2", gemma3 = "gemma3", gemma3n = "gemma3n", + gemmaEmbedding = "gemma-embedding", starcoder2 = "starcoder2", mamba = "mamba", mamba2 = "mamba2", @@ -62,6 +66,7 @@ export const enum GgufArchitectureType { t5encoder = "t5encoder", jais = "jais", nemotron = "nemotron", + nemotronH = "nemotron_h", exaone = "exaone", exaone4 = "exaone4", rwkv6 = "rwkv6", @@ -75,6 +80,7 @@ export const enum GgufArchitectureType { wavtokenizerDec = "wavtokenizer-dec", plm = "plm", bailingmoe = "bailingmoe", + bailingmoe2 = "bailingmoe2", dots1 = "dots1", arcee = "arcee", ernie4_5 = "ernie4_5", @@ -84,9 +90,15 @@ export const enum GgufArchitectureType { smollm3 = "smollm3", gptOss = "gpt-oss", lfm2 = "lfm2", + lfm2moe = "lfm2moe", dream = "dream", smallthinker = "smallthinker", llada = "llada", + lladaMoe = "llada-moe", + seedOss = "seed_oss", + grovemoe = "grovemoe", + apertus = "apertus", + cogvlm = "cogvlm", clip = "clip", unknown = "(unknown)" } diff --git a/templates/electron-typescript-react/src/App/App.css b/templates/electron-typescript-react/src/App/App.css index 03503d4b..6c920e19 100644 --- a/templates/electron-typescript-react/src/App/App.css +++ b/templates/electron-typescript-react/src/App/App.css @@ -16,6 +16,7 @@ width: 100%; min-height: 100%; max-width: 1280px; + --app-max-width: 1280px; > .chatHistory { margin-bottom: 32px; diff --git a/templates/electron-typescript-react/src/App/components/Header/Header.css b/templates/electron-typescript-react/src/App/components/Header/Header.css index 8c1c5fe5..9849d2ea 100644 --- a/templates/electron-typescript-react/src/App/components/Header/Header.css +++ b/templates/electron-typescript-react/src/App/components/Header/Header.css @@ -10,6 +10,7 @@ &.main { width: calc(100% - 16px * 2); + max-width: var(--app-max-width, 1280px); position: fixed; z-index: 10; diff --git a/templates/electron-typescript-react/src/App/components/InputRow/InputRow.css b/templates/electron-typescript-react/src/App/components/InputRow/InputRow.css index 582588b7..9453ade4 100644 --- a/templates/electron-typescript-react/src/App/components/InputRow/InputRow.css +++ b/templates/electron-typescript-react/src/App/components/InputRow/InputRow.css @@ -12,6 +12,7 @@ &.main { width: calc(100% - 16px * 
2); + max-width: var(--app-max-width, 1280px); position: fixed; background-color: var(--panel-background-color); border-radius: 12px; diff --git a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts index 3ccf0673..2e6067ee 100644 --- a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts +++ b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts @@ -114,7 +114,7 @@ describe("functionary", () => { freeRam: s1GB * 6 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7717"); + expect(res.contextSize).to.toMatchInlineSnapshot("7680"); } { const res = await resolveGpuLayers(0, { @@ -151,7 +151,7 @@ describe("functionary", () => { freeSwap: s1GB * 1 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } { const res = await resolveGpuLayers(0, { @@ -255,7 +255,7 @@ describe("functionary", () => { freeRam: s1GB * 4.5 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("4011"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); } try { await resolveGpuLayers(16, { @@ -318,7 +318,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7717"); + expect(res.contextSize).to.toMatchInlineSnapshot("7680"); } }); @@ -343,7 +343,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 7.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("1757"); + expect(res.contextSize).to.toMatchInlineSnapshot("1536"); } { const res = await resolveGpuLayers(16, { @@ -354,7 +354,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 5.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("5502"); + expect(res.contextSize).to.toMatchInlineSnapshot("5376"); } try { await resolveGpuLayers(16, { @@ -409,7 +409,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("4441"); + expect(res.contextSize).to.toMatchInlineSnapshot("4352"); } { const res = await resolveGpuLayers(16, { @@ -422,7 +422,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } }); @@ -608,7 +608,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } { const res = await resolveGpuLayers(32, { @@ -619,7 +619,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1164"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); } { const res = await resolveGpuLayers(32, { @@ -761,7 +761,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1164"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); } { const res = await resolveGpuLayers(33, { @@ -772,7 +772,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - 
expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } { const res = await resolveGpuLayers(33, { @@ -783,7 +783,7 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7717"); + expect(res.contextSize).to.toMatchInlineSnapshot("7680"); } { const res = await resolveGpuLayers(33, { @@ -795,7 +795,7 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7717"); + expect(res.contextSize).to.toMatchInlineSnapshot("7680"); } }); @@ -809,7 +809,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("6248"); + expect(res.contextSize).to.toMatchInlineSnapshot("6144"); } { const res = await resolveGpuLayers(33, { @@ -820,7 +820,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("2972"); + expect(res.contextSize).to.toMatchInlineSnapshot("2816"); } { const res = await resolveGpuLayers(33, { @@ -831,7 +831,7 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1333"); + expect(res.contextSize).to.toMatchInlineSnapshot("1280"); } try { await resolveGpuLayers(33, { @@ -908,7 +908,7 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("471"); + expect(res.contextSize).to.toMatchInlineSnapshot("458"); } { const res = await resolveGpuLayers("max", { @@ -918,7 +918,7 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("895"); + expect(res.contextSize).to.toMatchInlineSnapshot("768"); } }); @@ -962,7 +962,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); - expect(res.contextSize).to.toMatchInlineSnapshot("7471"); + expect(res.contextSize).to.toMatchInlineSnapshot("7424"); } { const res = await resolveGpuLayers("auto", { @@ -1062,7 +1062,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4718"); + expect(res.contextSize).to.toMatchInlineSnapshot("4608"); } { const res = await resolveGpuLayers("auto", { @@ -1072,7 +1072,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7995"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); } { const res = await resolveGpuLayers("auto", { @@ -1095,7 +1095,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } { const res = await resolveGpuLayers("auto", { @@ -1105,7 +1105,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } { const res = await resolveGpuLayers("auto", { @@ -1115,7 +1115,7 @@ describe("functionary", () => { 
freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("5438"); + expect(res.contextSize).to.toMatchInlineSnapshot("5376"); } { const res = await resolveGpuLayers("auto", { @@ -1125,7 +1125,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); - expect(res.contextSize).to.toMatchInlineSnapshot("7471"); + expect(res.contextSize).to.toMatchInlineSnapshot("7424"); } { const res = await resolveGpuLayers("auto", { @@ -1225,7 +1225,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4718"); + expect(res.contextSize).to.toMatchInlineSnapshot("4608"); } { const res = await resolveGpuLayers("auto", { @@ -1235,7 +1235,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7995"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); } { const res = await resolveGpuLayers("auto", { @@ -1349,7 +1349,7 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("4011"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); } }); @@ -1362,7 +1362,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } { const res = await resolveGpuLayers({min: 0, max: 4}, { @@ -1372,7 +1372,7 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2256"); + expect(res.contextSize).to.toMatchInlineSnapshot("2048"); } try { await resolveGpuLayers({min: 2}, { @@ -1451,7 +1451,7 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("4011"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); } }); }); @@ -1480,7 +1480,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("6531"); + expect(res.contextSize).to.toMatchInlineSnapshot("6400"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1492,7 +1492,7 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); - expect(res.contextSize).to.toMatchInlineSnapshot("7471"); + expect(res.contextSize).to.toMatchInlineSnapshot("7424"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1569,7 +1569,7 @@ describe("functionary", () => { freeRam: s1GB * 7 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("6531"); + expect(res.contextSize).to.toMatchInlineSnapshot("6400"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1581,7 +1581,7 @@ describe("functionary", () => { freeRam: s1GB * 7 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); - expect(res.contextSize).to.toMatchInlineSnapshot("7471"); + expect(res.contextSize).to.toMatchInlineSnapshot("7424"); expect(res.contextSize).to.be.gte(contextSize); } { diff --git 
a/test/modelDependent/llama3.2/sequenceState.test.ts b/test/modelDependent/llama3.2/sequenceState.test.ts index eec1916d..72e6a94f 100644 --- a/test/modelDependent/llama3.2/sequenceState.test.ts +++ b/test/modelDependent/llama3.2/sequenceState.test.ts @@ -195,13 +195,14 @@ describe("llama 3.2", () => { }); const contextSequence1 = context1.getSequence(); const contextSequence2 = context2.getSequence(); + expect(context2.contextSize).to.eql(256); // the context is actually bigger due to `llama.cpp`'s padding const chatSession1 = new LlamaChatSession({ contextSequence: contextSequence1 }); - const res1 = await chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); + const res1 = await chatSession1.prompt("Remember: locks are not doors. Also, write a long poem about it", {maxTokens: 154}); + expect(res1).toMatch(/^(A clever reminder indeed.|A wise phrase to ponder)/); const stateFile1Path = await getTempTestFilePath("state1"); @@ -211,12 +212,12 @@ describe("llama 3.2", () => { const contextSequence1TokensState = contextSequence1.tokenMeter.getState(); expect(contextSequence1.contextTokens).to.eql(state1Tokens); - expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("103"); - expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"11.27MB"'); + expect(contextSequence1.contextTokens.length).toMatchInlineSnapshot("262"); + expect(toBytes((await fs.stat(stateFile1Path)).size)).to.toMatchInlineSnapshot('"28.66MB"'); expect(contextSequence1TokensState).to.toMatchInlineSnapshot(` { - "usedInputTokens": 99, - "usedOutputTokens": 4, + "usedInputTokens": 108, + "usedOutputTokens": 154, } `); diff --git a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts index 39a722b0..4e013171 100644 --- a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts +++ b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts @@ -111,7 +111,7 @@ describe("stableCode", () => { freeVram: s1GB * 3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("8061"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); } try { await resolveGpuLayers(16, { @@ -142,7 +142,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("136"); + expect(res.contextSize).to.toMatchInlineSnapshot("36"); } @@ -174,7 +174,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("11347"); + expect(res.contextSize).to.toMatchInlineSnapshot("11264"); } try { await resolveGpuLayers(32, { @@ -192,7 +192,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("47"); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); } { @@ -223,12 +223,12 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("11347"); + expect(res.contextSize).to.toMatchInlineSnapshot("11264"); } try { await resolveGpuLayers(33, { totalVram: s1GB * 6, - freeVram: s1GB * 0.2 + freeVram: s1GB * 0.4 }); expect.unreachable("Should have thrown an error"); } catch (err) { @@ -241,7 +241,7 @@ describe("stableCode", () => { 
ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("47"); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); } { @@ -303,7 +303,7 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("47"); + expect(res.contextSize).to.toMatchInlineSnapshot("null"); } { const res = await resolveGpuLayers("max", { @@ -311,7 +311,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("5886"); + expect(res.contextSize).to.toMatchInlineSnapshot("5632"); } { const res = await resolveGpuLayers("max", { @@ -319,7 +319,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("6978"); + expect(res.contextSize).to.toMatchInlineSnapshot("6912"); } { const res = await resolveGpuLayers("max", { @@ -327,7 +327,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("8070"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); } }); @@ -346,7 +346,7 @@ describe("stableCode", () => { freeVram: s1GB * 0.4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); - expect(res.contextSize).to.toMatchInlineSnapshot("10841"); + expect(res.contextSize).to.toMatchInlineSnapshot("10752"); } { const res = await resolveGpuLayers("auto", { @@ -362,7 +362,7 @@ describe("stableCode", () => { freeVram: s1GB * 1.4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("5"); - expect(res.contextSize).to.toMatchInlineSnapshot("8361"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers("auto", { @@ -370,7 +370,7 @@ describe("stableCode", () => { freeVram: s1GB * 2.4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("1517"); + expect(res.contextSize).to.toMatchInlineSnapshot("1280"); } { const res = await resolveGpuLayers("auto", { @@ -378,7 +378,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3428"); + expect(res.contextSize).to.toMatchInlineSnapshot("3328"); } { const res = await resolveGpuLayers("auto", { @@ -386,7 +386,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3974"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); } { const res = await resolveGpuLayers("auto", { @@ -394,7 +394,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4520"); + expect(res.contextSize).to.toMatchInlineSnapshot("4352"); } { const res = await resolveGpuLayers("auto", { @@ -402,7 +402,7 @@ describe("stableCode", () => { freeVram: s1GB * 3.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5340"); + expect(res.contextSize).to.toMatchInlineSnapshot("5120"); } { const res = await resolveGpuLayers("auto", { @@ -410,7 +410,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5886"); + 
expect(res.contextSize).to.toMatchInlineSnapshot("5632"); } { const res = await resolveGpuLayers("auto", { @@ -418,7 +418,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("6705"); + expect(res.contextSize).to.toMatchInlineSnapshot("6656"); } { const res = await resolveGpuLayers("auto", { @@ -426,7 +426,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7251"); + expect(res.contextSize).to.toMatchInlineSnapshot("7168"); } { const res = await resolveGpuLayers("auto", { @@ -434,7 +434,7 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("8070"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); } { const res = await resolveGpuLayers("auto", { @@ -442,7 +442,7 @@ describe("stableCode", () => { freeVram: s1GB * 5.2 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("9163"); + expect(res.contextSize).to.toMatchInlineSnapshot("8960"); } { const res = await resolveGpuLayers("auto", { @@ -450,7 +450,7 @@ describe("stableCode", () => { freeVram: s1GB * 5.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("10801"); + expect(res.contextSize).to.toMatchInlineSnapshot("10752"); } { const res = await resolveGpuLayers("auto", { @@ -458,7 +458,7 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("11347"); + expect(res.contextSize).to.toMatchInlineSnapshot("11264"); } }); @@ -504,7 +504,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("13252"); + expect(res.contextSize).to.toMatchInlineSnapshot("13056"); } try { await resolveGpuLayers({min: 16}, { @@ -522,7 +522,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5886"); + expect(res.contextSize).to.toMatchInlineSnapshot("5632"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -532,7 +532,7 @@ describe("stableCode", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("23"); - expect(res.contextSize).to.toMatchInlineSnapshot("8248"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -542,7 +542,7 @@ describe("stableCode", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("8061"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); } }); @@ -565,7 +565,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5886"); + expect(res.contextSize).to.toMatchInlineSnapshot("5632"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -575,7 +575,7 @@ describe("stableCode", () => { freeVram: s1GB * 1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("3"); - 
expect(res.contextSize).to.toMatchInlineSnapshot("5921"); + expect(res.contextSize).to.toMatchInlineSnapshot("5888"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -585,7 +585,7 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("9206"); + expect(res.contextSize).to.toMatchInlineSnapshot("8960"); expect(res.contextSize).to.be.gte(contextSize); } {
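Note on the context-size padding introduced in this patch: `llama.cpp` aligns the KV cache size to multiples of 256 (see the `contextSizePad` constant and the `GGML_PAD` reference added in `src/config.ts`), and the new `padSafeContextSize` helper applies the same alignment on the JavaScript side before memory estimation. The standalone sketch below mirrors that helper so the arithmetic behind the updated inline snapshots is easy to check by hand; the example values at the end are illustrative only and are not taken from the test fixtures above.

```ts
// Mirror of src/evaluator/LlamaContext/utils/padSafeContextSize.ts from this patch,
// shown standalone. The `GGML_PAD`-style bit trick only works when `padding` is a power of two.
const contextSizePad = 256;

function ggmlPad(value: number, padding: number): number {
    // round `value` up to the nearest multiple of `padding`
    return (value + padding - 1) & ~(padding - 1);
}

function padSafeContextSize(
    value: number,
    padDirection: "up" | "down",
    padding: number = contextSizePad
): number {
    const paddedSize = ggmlPad(value, padding);

    if (paddedSize === value)
        return value; // already aligned
    else if (padDirection === "up")
        return paddedSize;
    else if (padDirection === "down") {
        // round down instead, but never below one full padding step
        const smallerPaddedSize = ggmlPad(value - padding, padding);
        if (smallerPaddedSize >= padding)
            return smallerPaddedSize;
    }

    return paddedSize;
}

// Illustrative values (not from the test fixtures):
console.log(padSafeContextSize(2256, "up"));   // 2304
console.log(padSafeContextSize(2256, "down")); // 2048
console.log(padSafeContextSize(2048, "down")); // 2048 (already a multiple of 256)
```

Aligning the requested size before estimating memory keeps `GgufInsights` consistent with the size `llama.cpp` will actually allocate, which appears to be why most of the resolver snapshots updated above now land on 256-aligned values.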