17 changes: 6 additions & 11 deletions .github/workflows/build.yml
@@ -70,10 +70,10 @@ jobs:
os: ubuntu-22.04
artifact: "linux-2"
- name: "macOS x64"
os: macos-13
os: macos-15-intel
artifact: "mac-x64"
- name: "macOS arm64"
os: macos-14
os: macos-26
artifact: "mac-arm64"

steps:
@@ -194,10 +194,11 @@ jobs:
sudo apt install vulkan-sdk

- name: Install dependencies on macOS
if: matrix.config.name == 'macOS'
if: matrix.config.name == 'macOS x64' || matrix.config.name == 'macOS arm64'
run: |
brew install cmake ninja
alias make=cmake
cmake --version

- name: Setup & Build
id: build
@@ -461,7 +462,7 @@ jobs:

model-dependent-tests:
name: Model dependent tests
runs-on: macos-13
runs-on: macos-26
env:
NODE_LLAMA_CPP_GPU: false
needs:
@@ -490,7 +491,6 @@ jobs:
# sudo apt-get install ninja-build cmake

- name: Install dependencies on macOS
if: matrix.config.name == 'macOS'
run: |
brew install cmake ninja
alias make=cmake
@@ -602,10 +602,8 @@ jobs:
- name: Release
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
GH_RELEASE_REF: ${{ github.ref }}
run: |
echo "//registry.npmjs.org/:_authToken=\${NPM_TOKEN}" > ~/.npmrc
export DRY_RUN_RESULT_FILE_PATH="$(pwd)/semanticReleaseDryRunReleaseResult.json"

git apply --ignore-whitespace ./scripts/patches/@semantic-release+github+11.0.0.patch
@@ -638,13 +636,10 @@ jobs:
- name: Release `create-node-llama-cpp` module
if: steps.set-npm-url.outputs.npm-url != ''
env:
NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
GH_RELEASE_REF: ${{ github.ref }}
run: |
cd packages/create-node-llama-cpp

echo "//registry.npmjs.org/:_authToken=\${NPM_TOKEN}" > ~/.npmrc

if [ "$GH_RELEASE_REF" == "refs/heads/beta" ]; then
npm publish --tag beta
else
@@ -682,7 +677,7 @@ jobs:
- name: "Ubuntu"
os: ubuntu-22.04
- name: "macOS"
os: macos-13
os: macos-15-intel

steps:
- uses: actions/checkout@v4
11 changes: 9 additions & 2 deletions llama/addon/AddonContext.cpp
@@ -2,6 +2,7 @@
#include <algorithm>
#include <cmath>
#include "common/common.h"
#include "llama-vocab.h"
#include "llama.h"

#include "addonGlobals.h"
@@ -345,8 +346,14 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker {
}
}

sampler->acceptToken(new_token_id);
result = new_token_id;
try {
sampler->acceptToken(new_token_id);
result = new_token_id;
} catch (const std::exception& e) {
SetError(std::string("Failed to accept token in sampler: ") + e.what());
} catch(...) {
SetError("Unknown error when calling \"acceptToken\"");
}
}
void OnOK() {
Napi::Number resultToken;
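The try/catch above routes exceptions from `acceptToken` through the AsyncWorker's `SetError`, so a llama.cpp failure reaches JavaScript as an error on the async sampling operation instead of an uncaught C++ exception taking down the process. A minimal sketch of how a caller might observe this, assuming a hypothetical `sample()` wrapper around this worker (the real high-level API may differ):

```typescript
// Hedged sketch: `context` and `sample()` stand in for whatever drives
// AddonContextSampleTokenWorker; the names here are assumptions.
try {
    const token = await context.sample();
    console.log("Sampled token:", token);
} catch (err) {
    // With this change, a sampler exception surfaces here as
    // "Failed to accept token in sampler: ..." instead of crashing the process.
    console.error("Sampling failed:", err);
}
```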
10 changes: 9 additions & 1 deletion llama/addon/AddonGrammar.cpp
@@ -49,7 +49,15 @@ Napi::Value AddonGrammar::isTextCompatible(const Napi::CallbackInfo& info) {
llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(parsed_grammar);

for (const auto & cpt : cpts) {
llama_grammar_accept(parsed_grammar, cpt);
try {
llama_grammar_accept(parsed_grammar, cpt);
} catch (const std::exception & e) {
llama_grammar_free_impl(parsed_grammar);
return Napi::Boolean::New(info.Env(), false);
} catch (...) {
llama_grammar_free_impl(parsed_grammar);
return Napi::Boolean::New(info.Env(), false);
}

if (stacks_cur.empty()) {
// no stacks means that the grammar failed to match at this point
21 changes: 19 additions & 2 deletions llama/addon/AddonSampler.cpp
@@ -1,5 +1,7 @@
#include <cmath>
#include "common/common.h"
#include "globals/addonLog.h"
#include "ggml.h"
#include "llama.h"

#include "AddonGrammarEvaluationState.h"
@@ -449,7 +451,15 @@ Napi::Value AddonSampler::AcceptGrammarEvaluationStateToken(const Napi::Callback
llama_token tokenId = info[1].As<Napi::Number>().Int32Value();

if ((grammar_evaluation_state)->sampler != nullptr) {
llama_sampler_accept((grammar_evaluation_state)->sampler, tokenId);
try {
llama_sampler_accept((grammar_evaluation_state)->sampler, tokenId);
} catch (const std::exception & e) {
Napi::Error::New(info.Env(), std::string("Failed to accept token in grammar sampler: ") + e.what()).ThrowAsJavaScriptException();
return info.Env().Undefined();
} catch (...) {
Napi::Error::New(info.Env(), "Failed to accept token in grammar sampler").ThrowAsJavaScriptException();
return info.Env().Undefined();
}
}

return info.Env().Undefined();
@@ -465,7 +475,14 @@ Napi::Value AddonSampler::CanBeNextTokenForGrammarEvaluationState(const Napi::Ca
candidates.emplace_back(llama_token_data { tokenId, 1, 0.0f });

llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
llama_sampler_apply((grammar_evaluation_state)->sampler, &candidates_p);
try {
llama_sampler_apply((grammar_evaluation_state)->sampler, &candidates_p);
} catch (const std::exception & e) {
addonLog(GGML_LOG_LEVEL_DEBUG, std::string("Failed to apply grammar sampler: ") + e.what());
return Napi::Boolean::New(info.Env(), false);
} catch (...) {
return Napi::Boolean::New(info.Env(), false);
}

if (candidates_p.size == 0 || candidates_p.data[0].logit == -INFINITY) {
return Napi::Boolean::New(info.Env(), false);
8 changes: 8 additions & 0 deletions llama/addon/addon.cpp
@@ -11,6 +11,8 @@
#include "globals/getSwapInfo.h"
#include "globals/getMemoryInfo.h"

#include <atomic>

bool backendInitialized = false;
bool backendDisposed = false;

@@ -226,6 +228,11 @@ Napi::Value addonSetNuma(const Napi::CallbackInfo& info) {
return info.Env().Undefined();
}

Napi::Value markLoaded(const Napi::CallbackInfo& info) {
static std::atomic_bool loaded = false;
return Napi::Boolean::New(info.Env(), loaded.exchange(true));
}

Napi::Value addonInit(const Napi::CallbackInfo& info) {
if (backendInitialized) {
Napi::Promise::Deferred deferred = Napi::Promise::Deferred::New(info.Env());
@@ -266,6 +273,7 @@ static void addonFreeLlamaBackend(Napi::Env env, int* data) {

Napi::Object registerCallback(Napi::Env env, Napi::Object exports) {
exports.DefineProperties({
Napi::PropertyDescriptor::Function("markLoaded", markLoaded),
Napi::PropertyDescriptor::Function("systemInfo", systemInfo),
Napi::PropertyDescriptor::Function("getSupportsGpuOffloading", addonGetSupportsGpuOffloading),
Napi::PropertyDescriptor::Function("getSupportsMmap", addonGetSupportsMmap),
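`markLoaded` is a one-way latch: the function-local `std::atomic_bool` starts as `false`, and `exchange(true)` sets it to `true` while returning the previous value. From the JavaScript side, the first call on a loaded copy of the binding therefore returns `false` and every subsequent call returns `true`. A small sketch of the expected semantics (assuming `bindings` is the loaded addon module):

```typescript
// Sketch of the expected behavior, not a test from the repository.
const firstCall = bindings.markLoaded();  // false — the binding was not marked before
const secondCall = bindings.markLoaded(); // true  — already marked by the first call
console.log(firstCall, secondCall);       // false true
```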
4 changes: 4 additions & 0 deletions llama/addon/globals/addonLog.cpp
@@ -137,3 +137,7 @@ Napi::Value setLoggerLogLevel(const Napi::CallbackInfo& info) {

return info.Env().Undefined();
}

void addonLog(ggml_log_level level, const std::string text) {
addonLlamaCppLogCallback(level, std::string("[addon] " + text + "\n").c_str(), nullptr);
}
2 changes: 2 additions & 0 deletions llama/addon/globals/addonLog.h
@@ -20,3 +20,5 @@ using AddonThreadSafeLogCallbackFunction =

Napi::Value setLogger(const Napi::CallbackInfo& info);
Napi::Value setLoggerLogLevel(const Napi::CallbackInfo& info);

void addonLog(ggml_log_level level, const std::string text);
1 change: 1 addition & 0 deletions src/bindings/AddonTypes.ts
@@ -48,6 +48,7 @@ export type BindingModule = {
acceptGrammarEvaluationStateToken(grammarEvaluationState: AddonGrammarEvaluationState, token: Token): void,
canBeNextTokenForGrammarEvaluationState(grammarEvaluationState: AddonGrammarEvaluationState, token: Token): boolean
},
markLoaded(): boolean,
systemInfo(): string,
getSupportsGpuOffloading(): boolean,
getSupportsMmap(): boolean,
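Together with the AddonSampler.cpp changes above, these bindings now fail gracefully: `acceptGrammarEvaluationStateToken` throws a JavaScript error when llama.cpp raises an exception, and `canBeNextTokenForGrammarEvaluationState` returns `false` instead of propagating it. A hedged sketch of defensive usage; `samplerBindings`, `grammarState`, and `token` stand in for values from the surrounding code, and the exact access path within `BindingModule` is an assumption:

```typescript
// Illustrates only the error behavior introduced by this PR.
if (samplerBindings.canBeNextTokenForGrammarEvaluationState(grammarState, token)) {
    try {
        samplerBindings.acceptGrammarEvaluationStateToken(grammarState, token);
    } catch (err) {
        // Surfaces as "Failed to accept token in grammar sampler: ..."
        // rather than aborting the process.
        console.error(err);
    }
}
```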
14 changes: 10 additions & 4 deletions src/bindings/Llama.ts
@@ -73,7 +73,7 @@ export class Llama {

private constructor({
bindings, bindingPath, extBackendsPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, numa, buildGpu,
maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator
maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator, skipLlamaInit
}: {
bindings: BindingModule,
bindingPath: string,
@@ -94,7 +94,8 @@
vramPadding: MemoryReservation,
ramOrchestrator: MemoryOrchestrator,
ramPadding: MemoryReservation,
swapOrchestrator: MemoryOrchestrator
swapOrchestrator: MemoryOrchestrator,
skipLlamaInit: boolean
}) {
this._dispatchPendingLogMicrotask = this._dispatchPendingLogMicrotask.bind(this);
this._onAddonLog = this._onAddonLog.bind(this);
@@ -106,7 +107,9 @@
? LlamaLogLevel.debug
: (logLevel ?? LlamaLogLevel.debug);

if (!this._debug) {
const previouslyLoaded = bindings.markLoaded();

if (!this._debug && (!skipLlamaInit || !previouslyLoaded)) {
this._bindings.setLogger(this._onAddonLog);
this._bindings.setLoggerLogLevel(LlamaLogLevelToAddonLogLevel.get(this._logLevel) ?? defaultLogLevel);
}
@@ -576,7 +579,8 @@
vramPadding: vramOrchestrator.reserveMemory(0),
ramOrchestrator,
ramPadding: resolvedRamPadding,
swapOrchestrator
swapOrchestrator,
skipLlamaInit
});

if (llama.gpu === false || vramPadding === 0) {
@@ -688,6 +692,8 @@ function getTransformedLogLevel(level: LlamaLogLevel, message: string, gpu: Buil
return LlamaLogLevel.info;
else if (gpu === false && level === LlamaLogLevel.warn && message.startsWith("llama_adapter_lora_init_impl: lora for '") && message.endsWith("' cannot use buft 'CPU_REPACK', fallback to CPU"))
return LlamaLogLevel.info;
else if (gpu === "metal" && level === LlamaLogLevel.warn && message.startsWith("ggml_metal_device_init: tensor API disabled for"))
return LlamaLogLevel.info;

return level;
}
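The constructor now registers the addon logger only when it is safe to do so: `markLoaded()` reports whether an earlier `Llama` instance already claimed this binding, and `skipLlamaInit` marks secondary instances such as the one created by `getLlamaWithoutBackend`. A condensed sketch of the resulting decision, mirroring the condition added above:

```typescript
// `debug`, `skipLlamaInit` and `previouslyLoaded` mirror the constructor's values.
function shouldRegisterLogger(debug: boolean, skipLlamaInit: boolean, previouslyLoaded: boolean): boolean {
    // A secondary instance reuses the logger already set by the primary one,
    // but only if the binding really was loaded before.
    return !debug && (!skipLlamaInit || !previouslyLoaded);
}

shouldRegisterLogger(false, true, true);  // false — keep the primary instance's logger
shouldRegisterLogger(false, true, false); // true  — nothing registered yet, so register
```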
18 changes: 13 additions & 5 deletions src/bindings/utils/compileLLamaCpp.ts
@@ -131,12 +131,20 @@ export async function compileLlamaCpp(buildOptions: BuildOptions, compileOptions
if (!cmakeCustomOptions.has("GGML_CCACHE"))
cmakeCustomOptions.set("GGML_CCACHE", "OFF");

if (!cmakeCustomOptions.has("LLAMA_CURL") || isCmakeValueOff(cmakeCustomOptions.get("LLAMA_CURL"))) {
cmakeCustomOptions.set("LLAMA_CURL", "OFF");
// avoid linking to extra libraries that we don't use
{
if (!cmakeCustomOptions.has("LLAMA_CURL") || isCmakeValueOff(cmakeCustomOptions.get("LLAMA_CURL")))
cmakeCustomOptions.set("LLAMA_CURL", "OFF");

// avoid linking to extra libraries that we don't use
if (!cmakeCustomOptions.has("LLAMA_OPENSSL"))
cmakeCustomOptions.set("LLAMA_OPENSSL", "OFF");
if (!cmakeCustomOptions.has("LLAMA_HTTPLIB") || isCmakeValueOff(cmakeCustomOptions.get("LLAMA_HTTPLIB"))) {
cmakeCustomOptions.set("LLAMA_HTTPLIB", "OFF");

if (!cmakeCustomOptions.has("LLAMA_BUILD_BORINGSSL"))
cmakeCustomOptions.set("LLAMA_BUILD_BORINGSSL", "OFF");

if (!cmakeCustomOptions.has("LLAMA_OPENSSL"))
cmakeCustomOptions.set("LLAMA_OPENSSL", "OFF");
}
}

if (buildOptions.platform === "win" && buildOptions.arch === "arm64" && !cmakeCustomOptions.has("GGML_OPENMP"))
32 changes: 22 additions & 10 deletions src/bindings/utils/getLlamaWithoutBackend.ts
@@ -16,16 +16,28 @@ export async function getLlamaWithoutBackend() {
if (sharedLlamaWithoutBackend != null)
return sharedLlamaWithoutBackend;

sharedLlamaWithoutBackend = await getLlamaForOptions({
gpu: false,
progressLogs: false,
logLevel: LlamaLogLevel.error,
build: "never",
usePrebuiltBinaries: true,
vramPadding: 0
}, {
skipLlamaInit: true
});
try {
sharedLlamaWithoutBackend = await getLlamaForOptions({
gpu: false,
progressLogs: false,
logLevel: LlamaLogLevel.error,
build: "never",
usePrebuiltBinaries: true,
vramPadding: 0
}, {
skipLlamaInit: true
});
} catch (err) {
sharedLlamaWithoutBackend = await getLlamaForOptions({
progressLogs: false,
logLevel: LlamaLogLevel.error,
build: "never",
usePrebuiltBinaries: true,
vramPadding: 0
}, {
skipLlamaInit: true
});
}

return sharedLlamaWithoutBackend;
});
4 changes: 3 additions & 1 deletion src/chatWrappers/QwenChatWrapper.ts
@@ -239,7 +239,9 @@ export class QwenChatWrapper extends ChatWrapper {
architecture === GgufArchitectureType.qwen2moe ||
architecture === GgufArchitectureType.qwen2vl ||
architecture === GgufArchitectureType.qwen3 ||
architecture === GgufArchitectureType.qwen3moe
architecture === GgufArchitectureType.qwen3moe ||
architecture === GgufArchitectureType.qwen3vl ||
architecture === GgufArchitectureType.qwen3vlmoe
);
}

3 changes: 3 additions & 0 deletions src/cli/commands/inspect/commands/InspectMeasureCommand.ts
@@ -21,6 +21,7 @@ import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDes
import {documentationPageUrls} from "../../../../config.js";
import {Llama} from "../../../../bindings/Llama.js";
import {toBytes} from "../../../utils/toBytes.js";
import {padSafeContextSize} from "../../../../evaluator/LlamaContext/utils/padSafeContextSize.js";

type InspectMeasureCommand = {
modelPath?: string,
@@ -952,6 +953,8 @@ function getContextSizesCheckPlan(trainContextSize: number, tests: number = 10,
if (size < 2)
size = 2;

size = padSafeContextSize(size, "up");

if (res[res.length - 1] === size) {
shouldStop = true;
return;
8 changes: 7 additions & 1 deletion src/cli/commands/source/commands/DownloadCommand.ts
@@ -271,7 +271,13 @@ export async function DownloadLlamaCppCommand(args: DownloadCommandArgs) {
console.log();
console.log();
console.log(`${chalk.yellow("Repo:")} ${repo}`);
console.log(`${chalk.yellow("Release:")} ${release}`);
console.log(
chalk.yellow("Release:") + " " + release + (
release === "latest"
? (" " + chalk.gray("(" + githubReleaseTag + ")"))
: ""
)
);
console.log();
console.log(chalk.green("Done"));
}
1 change: 1 addition & 0 deletions src/config.ts
@@ -125,3 +125,4 @@ export const documentationPageUrls = {
export const newGithubIssueUrl = "https://github.com/withcatai/node-llama-cpp/issues";
export const recommendedBaseDockerImage = "node:20";
export const minAllowedContextSizeInCalculations = 24;
export const contextSizePad = 256; // source: `GGML_PAD` usage in `llama_context::llama_context` in `llama-context.cpp`
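`contextSizePad` mirrors the padding llama.cpp applies to the context size (see the `GGML_PAD` reference in the comment above), and the new `padSafeContextSize(size, "up")` call in InspectMeasureCommand.ts rounds each tested size accordingly so the measured sizes line up with what llama.cpp would actually allocate. The helper's body is not part of this diff; a minimal sketch of what rounding up to the next multiple of the pad could look like (the import path and the "down" direction are assumptions):

```typescript
import {contextSizePad} from "../config.js"; // hypothetical import path

// Sketch only — the real padSafeContextSize implementation may differ.
function padSafeContextSize(size: number, direction: "up" | "down" = "up"): number {
    return direction === "up"
        ? Math.ceil(size / contextSizePad) * contextSizePad
        : Math.floor(size / contextSizePad) * contextSizePad;
}

padSafeContextSize(1000, "up"); // 1024 — the next multiple of 256
```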