diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index cd9666a21..0eb9dad8a 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1990,7 +1990,19 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla
uint32_t cell_range_begin = cells.size();
for (uint32_t i = 0; i < cells.size(); ++i) {
- if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
+ bool add_cell = true;
+
+ add_cell = add_cell && !cells.is_empty(i);
+ add_cell = add_cell && (seq_id == -1 || cells.seq_has(i, seq_id));
+
+ // check the cell is not SWA-masked
+ if (add_cell && seq_id != -1) {
+ const bool is_masked = llama_hparams::is_masked_swa(n_swa, swa_type, cells.pos_get(i), cells.seq_pos_max(seq_id));
+
+ add_cell = !is_masked;
+ }
+
+ if (add_cell) {
++cell_count;
if (cell_range_begin == cells.size()) {
cell_range_begin = i;
@@ -2246,7 +2258,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
sinfo = find_slot(ubatch, false);
if (sinfo.empty()) {
- LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+ LLAMA_LOG_ERROR("%s: failed to find %d available cells in kv cache\n", __func__, cell_count);
return false;
}
diff --git a/tools/omnivoice/CMakeLists.txt b/tools/omnivoice/CMakeLists.txt
index 72088dfa0..6cb3e13a6 100644
--- a/tools/omnivoice/CMakeLists.txt
+++ b/tools/omnivoice/CMakeLists.txt
@@ -78,6 +78,11 @@ set(OMNIVOICE_CORE_SOURCES
# llama + mtmd into a single ABI-stable C surface.
set(OMNIVOICE_FFI_SOURCES
src/eliza-inference-ffi.cpp
+ # Multi-runtime streaming-LLM backend seam (cutover plan M3). The selector
+ # is always compiled; it is inert until a -DELIZA_ENABLE_* accelerator
+ # backend below registers itself, so the default build keeps the in-tree
+ # llama.cpp path.
+ src/llm-backend-selector.cpp
)
# Vendored standalone voice-classifier forward graphs (pure scalar C, no
@@ -220,6 +225,19 @@ endif()
# (the fused build links mtmd anyway), opt out with -DELIZA_ENABLE_VISION=OFF.
option(ELIZA_ENABLE_VISION "Build the fused mmproj vision-describe ABI (v9)" ON)
+# ELIZA_ENABLE_LITERT — compile the LiteRT-LM in-process streaming-LLM backend
+# (cutover plan M4 — Android NPU: Tensor / Qualcomm QNN / MediaTek NeuroPilot).
+# OFF by default: the selector registers no LiteRT backend and the streaming-LLM
+# pipe keeps the in-tree llama.cpp path. ON requires the LiteRT-LM SDK
+# (ELIZA_LITERT_SDK_DIR) — a host/device cross-build concern, not the Linux CI
+# default. See docs/multi-backend-ffi-seam.md.
+option(ELIZA_ENABLE_LITERT "Build the LiteRT-LM in-process LLM backend (M4)" OFF)
+
+# ELIZA_ENABLE_MLX — compile the CoreML/MLX in-process streaming-LLM backend
+# (cutover plan M5 — Apple Silicon). OFF by default; ON is Apple-only and
+# requires the MLX / CoreML toolchain. See docs/multi-backend-ffi-seam.md.
+option(ELIZA_ENABLE_MLX "Build the CoreML/MLX in-process LLM backend (M5)" OFF)
+
if(TARGET mtmd)
add_library(elizainference SHARED
${OMNIVOICE_CORE_SOURCES}
@@ -271,6 +289,48 @@ if(TARGET mtmd)
${CMAKE_CURRENT_SOURCE_DIR}/../kokoro/include)
target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_KOKORO)
endif()
+ # ── Multi-runtime streaming-LLM accelerator backends (cutover M4/M5) ────
+ # The M3 selector (src/llm-backend-selector.cpp) is always compiled in via
+ # OMNIVOICE_FFI_SOURCES. These accelerator backends each link an external
+ # SDK, so they are opt-in. When a gate is OFF its source is not compiled,
+ # the selector's `#ifdef`-guarded factory declaration + registration drop
+ # out, and the streaming-LLM pipe keeps the in-tree llama.cpp path — so the
+ # default desktop/CI build is byte-for-byte the pre-seam behavior.
+    if(ELIZA_ENABLE_LITERT)
+        # Compile the LiteRT-LM backend into the fused library and define the
+        # gate macro so the real (SDK-using) branch of the source is selected.
+        target_sources(elizainference PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp)
+        target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT)
+        # LiteRT-LM SDK (github.com/google-ai-edge/LiteRT-LM). Point at a built
+        # SDK with -DELIZA_LITERT_SDK_DIR=<path>; the device/host cross-build
+        # links its libs + the NPU delegates with -DELIZA_LITERT_LIBS=<libs>.
+        if(ELIZA_LITERT_SDK_DIR)
+            target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/include)
+            target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/lib)
+        endif()
+        if(ELIZA_LITERT_LIBS)
+            target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LIBS})
+        endif()
+    endif()
+ # CoreML/MLX accelerator backend (opt-in, Apple hosts only).
+ if(ELIZA_ENABLE_MLX)
+ # Fail at configure time rather than with missing-framework errors later.
+ if(NOT APPLE)
+ message(FATAL_ERROR
+ "ELIZA_ENABLE_MLX requires an Apple host (CoreML/MLX are Apple-only).")
+ endif()
+ # The backend is an Objective-C++ (.mm) source added to the fused library;
+ # the compile definition selects its real implementation branch.
+ target_sources(elizainference PRIVATE
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/mlx-coreml-backend.mm)
+ target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_MLX)
+ # MLX C API (ml-explore/mlx-c) via -DELIZA_MLX_C_DIR / -DELIZA_MLX_LIBS,
+ # plus the system CoreML / Metal / Foundation frameworks.
+ # Both variables are optional: each is applied only when it is set.
+ if(ELIZA_MLX_C_DIR)
+ target_include_directories(elizainference PRIVATE ${ELIZA_MLX_C_DIR}/include)
+ target_link_directories(elizainference PRIVATE ${ELIZA_MLX_C_DIR}/lib)
+ endif()
+ if(ELIZA_MLX_LIBS)
+ target_link_libraries(elizainference PRIVATE ${ELIZA_MLX_LIBS})
+ endif()
+ # System frameworks are always linked when this gate is ON.
+ target_link_libraries(elizainference PRIVATE
+ "-framework Foundation" "-framework CoreML" "-framework Metal")
+ endif()
set_target_properties(elizainference PROPERTIES
OUTPUT_NAME elizainference
POSITION_INDEPENDENT_CODE ON)
diff --git a/tools/omnivoice/src/backends/litert-backend.cpp b/tools/omnivoice/src/backends/litert-backend.cpp
new file mode 100644
index 000000000..3b3dad137
--- /dev/null
+++ b/tools/omnivoice/src/backends/litert-backend.cpp
@@ -0,0 +1,471 @@
+/*
+ * litert-backend.cpp — LiteRT-LM in-process streaming-LLM backend (M4).
+ *
+ * See litert-backend.h for the targeted LiteRT-LM C++ API (repo + commit
+ * date cited there). The real implementation is gated behind
+ * `ELIZA_ENABLE_LITERT`; the default (Linux/desktop) build compiles the stub
+ * branch, which links zero LiteRT-LM SDK headers and reports
+ * `available() == false` so the selector keeps the in-tree llama.cpp path.
+ *
+ * Error contract (native/AGENTS.md §3 + §9): never log, never return a
+ * defaulted result on failure. Every failure path heap-allocates `*out_error`
+ * via litert_set_error() (matching the FFI cpp's eliza_strdup/eliza_set_error
+ * style) and returns the negative ELIZA_* code or nullptr.
+ */
+
+#include "litert-backend.h"
+
+#include <cstdint>       // int32_t
+#include <cstdlib>       // std::malloc
+#include <cstring>       // std::memcpy
+#include <string>        // std::string
+#include <system_error>  // std::error_code
+
+/* <filesystem> is optional: when the toolchain lacks it the artifact probe
+ * below compiles to a stub that never finds an artifact. */
+#if defined(__has_include)
+#  if __has_include(<filesystem>)
+#    include <filesystem>
+#    define LITERT_HAVE_FILESYSTEM 1
+#  endif
+#endif
+
+/* ── Heap-allocated error strings (mirror eliza-inference-ffi.cpp) ───────── */
+namespace {
+
+/* Duplicate `s` into a malloc()-allocated, NUL-terminated C string.
+ * Returns nullptr when the allocation fails; otherwise the caller owns the
+ * buffer and releases it with free(). */
+char * litert_strdup(const std::string & s) {
+    const size_t n_bytes = s.size() + 1;  // payload plus the terminating NUL
+    char * out = static_cast<char *>(std::malloc(n_bytes));
+    if (!out) return nullptr;
+    std::memcpy(out, s.c_str(), n_bytes);
+    return out;
+}
+
+/* Publish `msg` through `*out_error` as a heap string built by
+ * litert_strdup(). A null `out_error` means the caller does not want the
+ * text, so nothing is allocated. */
+void litert_set_error(char ** out_error, const std::string & msg) {
+    if (out_error != nullptr) {
+        *out_error = litert_strdup(msg);
+    }
+}
+
+#if defined(LITERT_HAVE_FILESYSTEM)
+/* Probe <bundle_dir>/text/ for a *.litertlm artifact. Cheap directory walk,
+ * no model load (LlmBackendFactory::can_serve contract).
+ *
+ * Returns the artifact path, or an empty string when `bundle_dir` is
+ * null/empty, the text subdirectory is missing, or no regular file carries
+ * the artifact extension. Directory iteration order is unspecified, so when
+ * several artifacts are present the lexicographically smallest path is
+ * returned: the same bundle always resolves to the same file. */
+std::string find_litertlm_artifact(const char * bundle_dir) {
+    if (!bundle_dir || bundle_dir[0] == '\0') return std::string();
+    std::error_code ec;
+    const std::filesystem::path text_dir =
+        std::filesystem::path(bundle_dir) / LITERT_BUNDLE_TEXT_SUBDIR;
+    if (!std::filesystem::is_directory(text_dir, ec)) return std::string();
+    std::string best;
+    for (std::filesystem::directory_iterator it(text_dir, ec), end;
+         !ec && it != end; it.increment(ec)) {
+        if (!it->is_regular_file(ec)) continue;
+        if (it->path().extension() == LITERT_ARTIFACT_EXT) {
+            std::string candidate = it->path().string();
+            if (best.empty() || candidate < best) best.swap(candidate);
+        }
+    }
+    return best;
+}
+#else
+/* No <filesystem> on this toolchain: the probe can never find an artifact. */
+std::string find_litertlm_artifact(const char *) { return std::string(); }
+#endif
+
+} // namespace
+
+/* ════════════════════════════════════════════════════════════════════════ *
+ * REAL implementation — only when ELIZA_ENABLE_LITERT is defined.
+ * Behind this gate we may include LiteRT-LM SDK headers; outside it we
+ * include NONE so the file builds on a host without the SDK.
+ * ════════════════════════════════════════════════════════════════════════ */
+#ifdef ELIZA_ENABLE_LITERT
+
+#include <atomic>   // std::atomic (cancel flag)
+#include <memory>   // std::unique_ptr
+#include <mutex>    // std::once_flag, std::call_once
+#include <utility>  // std::move
+#include <vector>   // std::vector
+
+/* LiteRT-LM cross-platform C++ runtime. Paths per the repo's bazel layout
+ * (github.com/google-ai-edge/LiteRT-LM, `main`, researched 2026-06-22). */
+#include "runtime/engine/engine.h"          // litert::lm::Engine, SessionInterface
+#include "runtime/engine/engine_settings.h" // EngineSettings, SessionConfig, ModelAssets
+#include "runtime/engine/io_types.h"        // InputData, InputText, Responses
+
+namespace {
+
+using litert::lm::Backend;
+using litert::lm::Engine;
+using litert::lm::EngineSettings;
+using litert::lm::InputData;
+using litert::lm::InputText;
+using litert::lm::ModelAssets;
+using litert::lm::Responses;
+using litert::lm::SessionConfig;
+
+/* The Session type the templated Engine hands back (Engine::Session is the
+ * public alias EngineT exposes; for Engine it is SessionInterface). */
+using Session = Engine::Session;
+
+/* Which accelerator rung the factory ended up on at open(); kept for
+ * diagnostics and preference reporting. DEVICE-VERIFY: the rung that actually
+ * initializes depends on the hardware and can only be confirmed on an
+ * NPU/GPU device. */
+enum class ResolvedAccelerator { kNone, kNpu, kGpu, kCpu };
+
+/* Lower-case label for an accelerator; anything other than NPU/GPU/CPU
+ * (including kNone) reads as "none". The returned literal has static
+ * storage duration. */
+const char * accelerator_name(ResolvedAccelerator a) {
+    if (a == ResolvedAccelerator::kNpu) return "npu";
+    if (a == ResolvedAccelerator::kGpu) return "gpu";
+    if (a == ResolvedAccelerator::kCpu) return "cpu";
+    return "none";
+}
+
+/* Try to build an Engine for `artifact` on `backend`.
+ *
+ * Runs the three LiteRT-LM construction steps in order (model assets, default
+ * engine settings, engine) and stops at the first one that fails.
+ *
+ * Returns the Engine on success. On failure returns nullptr so the caller's
+ * ladder can fall through to the next rung, and overwrites `last_err` with
+ * the status message of the step that failed. `last_err` is left untouched
+ * on success. */
+std::unique_ptr<Engine> try_engine(const std::string & artifact,
+                                   Backend backend,
+                                   std::string & last_err) {
+    auto model_assets = ModelAssets::Create(artifact);
+    if (!model_assets.ok()) {
+        last_err = std::string(model_assets.status().message());
+        return nullptr;
+    }
+    auto settings = EngineSettings::CreateDefault(*model_assets, backend);
+    if (!settings.ok()) {
+        last_err = std::string(settings.status().message());
+        return nullptr;
+    }
+    auto engine = Engine::CreateEngine(*settings);
+    if (!engine.ok()) {
+        last_err = std::string(engine.status().message());
+        return nullptr;
+    }
+    /* Move the unique_ptr out of the StatusOr wrapper (not a local object,
+     * so the explicit move is required). */
+    return std::move(*engine);
+}
+
+/* ── Session: mirrors the FFI streaming pull contract 1:1 ────────────────── */
+class LiteRtBackendSession final : public LlmBackendSession {
+public:
+    /* Takes ownership of the warm Engine and its first Session. A
+     * non-positive cfg.max_tokens is stored as 0, which next() treats as
+     * "no token cap". */
+    LiteRtBackendSession(std::unique_ptr<Engine> engine,
+                         std::unique_ptr<Session> session,
+                         const eliza_llm_stream_config_t & cfg,
+                         ResolvedAccelerator accel)
+        : engine_(std::move(engine)),
+          session_(std::move(session)),
+          accel_(accel),
+          max_tokens_(cfg.max_tokens > 0 ? cfg.max_tokens : 0) {}
+
+    /* prefill: copy the caller's tokens, detokenize through the engine's
+     * tokenizer, and run a LiteRT prefill pass. The FFI hands pre-tokenized
+     * ids (text-model vocab); LiteRT-LM's prefill consumes InputData (text),
+     * so we round-trip ids → text via the shared tokenizer rather than
+     * assuming vocab parity (the .litertlm graph carries its own tokenizer).
+     * DEVICE-VERIFY: id/text round-trip fidelity needs a real .litertlm.
+     *
+     * Returns ELIZA_OK on success, ELIZA_ERR_INVALID_ARG when the session is
+     * closed or `token_ids` is NULL with a non-zero count,
+     * ELIZA_ERR_CANCELLED after cancel(), and ELIZA_ERR_FFI_FAULT when
+     * RunPrefill fails. */
+    int prefill(const int32_t * token_ids, size_t num_tokens,
+                char ** out_error) override {
+        if (!session_) {
+            litert_set_error(out_error,
+                "[litert-lm] prefill: session is not open");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (!token_ids && num_tokens > 0) {
+            /* Reject instead of dereferencing a null buffer below. */
+            litert_set_error(out_error,
+                "[litert-lm] prefill: token_ids is NULL");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancelled_.load(std::memory_order_acquire)) {
+            return ELIZA_ERR_CANCELLED;
+        }
+        /* NOTE(review): the tokenizer's id element type is assumed to be
+         * `int` — confirm against the LiteRT-LM tokenizer header. */
+        std::vector<int> ids;
+        ids.reserve(num_tokens);
+        for (size_t i = 0; i < num_tokens; ++i) ids.push_back(token_ids[i]);
+
+        const std::string text = engine_->GetTokenizer().Detokenize(ids);
+        std::vector<InputData> contents;
+        contents.emplace_back(InputText(std::string(text)));
+
+        absl::Status st = session_->RunPrefill(contents);
+        if (!st.ok()) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] RunPrefill failed: ") +
+                std::string(st.message()));
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        prefilled_ = true;
+        return ELIZA_OK;
+    }
+
+    /* next: one decode step. LiteRT-LM's RunDecode() returns a Responses
+     * batch; we emit the newly-produced UTF-8 delta as detokenized text and
+     * its token ids. LiteRT-LM has no in-process MTP drafter exposed through
+     * this surface, so drafted/accepted are always 0. Returns 1 (final) at
+     * EOS or the max-token cap, 0 otherwise, or a negative ELIZA_* code on
+     * failure.
+     *
+     * When the pending text does not fit `text_out`, only the prefix that
+     * fits (cut on a UTF-8 code-point boundary) is emitted and accounted
+     * for; the remainder stays pending and is delivered by the next pull
+     * instead of being dropped. */
+    int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out,
+             char * text_out, size_t text_cap, int32_t * drafter_drafted_out,
+             int32_t * drafter_accepted_out, char ** out_error) override {
+        /* Zero every output first so early returns leave a defined state. */
+        if (num_tokens_out) *num_tokens_out = 0;
+        if (text_out && text_cap) text_out[0] = '\0';
+        if (drafter_drafted_out) *drafter_drafted_out = 0;
+        if (drafter_accepted_out) *drafter_accepted_out = 0;
+
+        if (!session_) {
+            litert_set_error(out_error, "[litert-lm] next: session not open");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (!prefilled_) {
+            litert_set_error(out_error,
+                "[litert-lm] next: prefill must run before next");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancelled_.load(std::memory_order_acquire)) {
+            return ELIZA_ERR_CANCELLED;
+        }
+
+        auto responses = session_->RunDecode();
+        if (!responses.ok()) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] RunDecode failed: ") +
+                std::string(responses.status().message()));
+            return ELIZA_ERR_FFI_FAULT;
+        }
+
+        /* RunDecode yields the running candidate texts; GetTexts()[0] is the
+         * cumulative decode for candidate 0. Emit only the suffix produced
+         * since the last step so the FFI streams a delta per pull. */
+        const std::vector<std::string> & texts = responses->GetTexts();
+        const std::string cumulative =
+            texts.empty() ? std::string() : texts.front();
+        const std::string pending = compute_delta(cumulative);
+
+        /* Decide how much of the pending text this pull can deliver. With no
+         * usable text buffer (null, or room for the NUL only) the whole
+         * pending text is consumed, matching the text-less pull behavior. */
+        size_t take = pending.size();
+        if (text_out && text_cap > 1) {
+            take = utf8_prefix_length(pending, text_cap - 1);
+        }
+        const std::string delta = pending.substr(0, take);
+        /* Advance only past what was delivered. When nothing is pending the
+         * marker snaps to the reported cumulative size. */
+        emitted_chars_ =
+            pending.empty() ? cumulative.size() : emitted_chars_ + take;
+
+        /* Re-tokenize the delta against the engine tokenizer so the FFI gets
+         * committed text-vocab ids (the same round-trip the prefill used).
+         * NOTE(review): ids beyond `tokens_cap` are not delivered. */
+        const auto delta_ids = engine_->GetTokenizer().Tokenize(delta);
+        size_t n_emit = delta_ids.size();
+        if (n_emit > tokens_cap) n_emit = tokens_cap;
+        if (tokens_out) {
+            for (size_t i = 0; i < n_emit; ++i) {
+                tokens_out[i] = static_cast<int32_t>(delta_ids[i]);
+            }
+        }
+        if (num_tokens_out) *num_tokens_out = n_emit;
+        if (text_out && text_cap) {
+            const size_t copy = delta.size() < text_cap - 1
+                                    ? delta.size()
+                                    : text_cap - 1;
+            std::memcpy(text_out, delta.data(), copy);
+            text_out[copy] = '\0';
+        }
+
+        decoded_tokens_ += static_cast<int32_t>(delta_ids.size());
+        const bool hit_cap =
+            max_tokens_ > 0 && decoded_tokens_ >= max_tokens_;
+        /* DEVICE-VERIFY: the precise EOS signal LiteRT-LM exposes per step is
+         * runtime-version-dependent. A done decode yields no new delta; treat
+         * an empty delta or the token cap as the final step. */
+        const bool eos = delta_ids.empty();
+        return (hit_cap || eos) ? 1 : 0;
+    }
+
+    /* cancel: publish a flag the next prefill/decode step observes. The flag
+     * is an atomic store, so it may be raised from another thread. */
+    int cancel() override {
+        cancelled_.store(true, std::memory_order_release);
+        return ELIZA_OK;
+    }
+
+    /* reset: build a fresh Session from the same Engine (clears KV + sampler).
+     * Reuses the warm Engine (model weights stay resident) — only the
+     * per-generation Session is rebuilt. Returns ELIZA_OK, or
+     * ELIZA_ERR_FFI_FAULT when the new Session cannot be created. */
+    int reset() override {
+        auto cfg = SessionConfig::CreateDefault();
+        auto session = engine_->CreateSession(cfg);
+        if (!session.ok()) {
+            /* reset has no out_error param. Drop the stale session so the
+             * failure surfaces on the next prefill/next ("session not open")
+             * instead of silently decoding on top of the old KV state. A
+             * later successful reset() reopens the session. */
+            session_.reset();
+            prefilled_ = false;
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        session_ = std::move(*session);
+        cancelled_.store(false, std::memory_order_release);
+        prefilled_ = false;
+        decoded_tokens_ = 0;
+        emitted_chars_ = 0;
+        return ELIZA_OK;
+    }
+
+    /* reset_keep: LiteRT-LM's Session does not expose prefix-preserving KV
+     * trimming through this surface, so fall back to a full reset and return 0
+     * (no prefix kept) — never an error (llm-backend.h contract). A failed
+     * reset closes the session, which the next prefill/next reports. */
+    int reset_keep(int32_t /*n_keep*/) override {
+        (void) reset();
+        return 0;
+    }
+
+    /* Name of the accelerator rung this session was opened on. */
+    const char * accelerator() const { return accelerator_name(accel_); }
+
+private:
+    /* The suffix of `cumulative` produced since the last emitted step. */
+    std::string compute_delta(const std::string & cumulative) const {
+        if (cumulative.size() <= emitted_chars_) return std::string();
+        return cumulative.substr(emitted_chars_);
+    }
+
+    /* Longest prefix of `s`, at most `max_bytes` long, that does not end in
+     * the middle of a UTF-8 sequence. If backing up over continuation bytes
+     * reaches the start (input is not valid UTF-8, or one code point is
+     * larger than the buffer) the raw byte limit is used so progress is
+     * still made. */
+    static size_t utf8_prefix_length(const std::string & s, size_t max_bytes) {
+        if (s.size() <= max_bytes) return s.size();
+        size_t n = max_bytes;
+        while (n > 0 && (static_cast<unsigned char>(s[n]) & 0xC0) == 0x80) {
+            --n;
+        }
+        return n > 0 ? n : max_bytes;
+    }
+
+    std::unique_ptr<Engine> engine_;    // warm engine; outlives every session
+    std::unique_ptr<Session> session_;  // per-generation state; null = closed
+    std::atomic<bool> cancelled_{false};
+    bool prefilled_ = false;            // next() requires a prior prefill()
+    int32_t decoded_tokens_ = 0;        // tokens emitted since the last reset
+    size_t emitted_chars_ = 0;          // bytes of cumulative text delivered
+    ResolvedAccelerator accel_ = ResolvedAccelerator::kNone;
+    int32_t max_tokens_ = 0;            // 0 = no cap
+};
+
+/* ── Factory ─────────────────────────────────────────────────────────────── */
+class LiteRtBackendFactory final : public LlmBackendFactory {
+public:
+    const char * name() const override { return LITERT_BACKEND_NAME; }
+
+    /* available(): compiled in AND an accelerator (NPU or GPU) initializes on
+     * THIS host. Cheap — must not load a model. We probe by building a minimal
+     * EngineSettings on NPU then GPU with NO model assets; a backend whose
+     * delegate is missing fails settings validation. CPU alone does NOT make
+     * this backend "available" (CPU is the in-tree llama.cpp path's job).
+     * DEVICE-VERIFY: real delegate presence is only knowable on-device. */
+    bool available() const override {
+        return probe_accelerator() != ResolvedAccelerator::kNone;
+    }
+
+    /* can_serve(): a *.litertlm exists under <bundle_dir>/text/. Cheap probe,
+     * no caching — open() re-resolves the bundle from the context accessor. */
+    bool can_serve(const char * bundle_dir) const override {
+        return !find_litertlm_artifact(bundle_dir).empty();
+    }
+
+    /* preference_rank(): high on Android NPU (the whole reason this backend
+     * exists), modest on a GPU-only fallback, 0 otherwise so llama.cpp wins. */
+    int preference_rank() const override {
+        switch (probe_accelerator()) {
+            case ResolvedAccelerator::kNpu: return 100;
+            case ResolvedAccelerator::kGpu: return 20;
+            default:                        return 0;
+        }
+    }
+
+    /* open(): resolve the .litertlm under the cached bundle, then walk the
+     * accelerator ladder NPU → GPU → CPU, recording which rung built the
+     * Engine. Builds a default Session and returns the streaming session.
+     *
+     * Returns a heap-allocated session the caller owns, or nullptr with
+     * `*out_error` set when `cfg` is NULL, no artifact is found, no rung can
+     * build an Engine, or the Session cannot be created. */
+    LlmBackendSession * open(EliInferenceContext * ctx,
+                             const eliza_llm_stream_config_t * cfg,
+                             char ** out_error) override {
+        if (!cfg) {
+            litert_set_error(out_error, "[litert-lm] open: cfg is NULL");
+            return nullptr;
+        }
+        const char * bundle_dir = llm_backend_context_bundle_dir(ctx);
+        const std::string bundle = bundle_dir ? bundle_dir : std::string();
+        std::string artifact = find_litertlm_artifact(bundle.c_str());
+        if (artifact.empty()) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] open: no ") + LITERT_ARTIFACT_EXT +
+                " artifact under " + bundle + "/" + LITERT_BUNDLE_TEXT_SUBDIR);
+            return nullptr;
+        }
+
+        /* Accelerator ladder — NPU first (Qualcomm QNN / MediaTek NeuroPilot /
+         * Google Tensor), then GPU (OpenCL/Metal/WebGPU), then CPU (XNNPACK).
+         * Only the most recent rung's failure text is kept, so when every
+         * rung fails the final diagnostic reports the last one tried.
+         * DEVICE-VERIFY: rung availability is hardware-specific. */
+        struct Rung { Backend backend; ResolvedAccelerator accel; };
+        const Rung ladder[] = {
+            {Backend::NPU, ResolvedAccelerator::kNpu},
+            {Backend::GPU, ResolvedAccelerator::kGpu},
+            {Backend::CPU, ResolvedAccelerator::kCpu},
+        };
+
+        std::unique_ptr<Engine> engine;
+        ResolvedAccelerator resolved = ResolvedAccelerator::kNone;
+        std::string last_err;
+        for (const Rung & rung : ladder) {
+            engine = try_engine(artifact, rung.backend, last_err);
+            if (engine) {
+                resolved = rung.accel;
+                break;
+            }
+        }
+        if (!engine) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] open: no accelerator could build the "
+                            "engine (last error: ") + last_err + ")");
+            return nullptr;
+        }
+
+        auto session_cfg = SessionConfig::CreateDefault();
+        auto session = engine->CreateSession(session_cfg);
+        if (!session.ok()) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] open: CreateSession failed on ") +
+                accelerator_name(resolved) + ": " +
+                std::string(session.status().message()));
+            return nullptr;
+        }
+
+        /* Raw `new`: the seam hands ownership to the caller as a plain
+         * pointer. */
+        return new LiteRtBackendSession(std::move(engine), std::move(*session),
+                                        *cfg, resolved);
+    }
+
+private:
+    /* Build a no-model EngineSettings on NPU then GPU; the first whose
+     * delegate validates marks that rung present. Result is memoized so the
+     * repeated available()/preference_rank() calls are cheap.
+     * DEVICE-VERIFY: settings-only validation is the cheapest honest probe;
+     * the true delegate handshake happens at open() on-device. */
+    ResolvedAccelerator probe_accelerator() const {
+        std::call_once(probe_once_, [this]() {
+            auto empty = ModelAssets::Create(std::string());
+            if (!empty.ok()) { probed_ = ResolvedAccelerator::kNone; return; }
+            if (EngineSettings::CreateDefault(*empty, Backend::NPU).ok()) {
+                probed_ = ResolvedAccelerator::kNpu;
+            } else if (EngineSettings::CreateDefault(*empty, Backend::GPU).ok()) {
+                probed_ = ResolvedAccelerator::kGpu;
+            } else {
+                probed_ = ResolvedAccelerator::kNone;
+            }
+        });
+        return probed_;
+    }
+
+    /* `mutable` so the const query methods can fill the memoized probe. */
+    mutable std::once_flag probe_once_;
+    mutable ResolvedAccelerator probed_ = ResolvedAccelerator::kNone;
+};
+
+} // namespace
+
+/* Accessor for the real LiteRT-LM factory: one function-local static
+ * instance, handed out by address on every call. */
+LlmBackendFactory * litert_backend_factory() {
+    static LiteRtBackendFactory instance;
+    LlmBackendFactory * const as_base = &instance;
+    return as_base;
+}
+
+#else /* ────────────────────────── STUB (no LiteRT-LM SDK) ──────────────── */
+
+/*
+ * Stub used when the LiteRT-LM SDK is not compiled in. It includes no SDK
+ * header, so it builds on any host. Every query answers "no": available()
+ * and can_serve() are false, preference_rank() is 0, and open() yields
+ * nullptr after writing a "not compiled in" message to `*out_error`, which
+ * lets the selector stay on the in-tree llama.cpp path.
+ */
+namespace {
+
+class LiteRtBackendFactoryStub final : public LlmBackendFactory {
+public:
+    const char * name() const override {
+        return LITERT_BACKEND_NAME;
+    }
+
+    bool available() const override {
+        return false;
+    }
+
+    bool can_serve(const char * /*bundle_dir*/) const override {
+        return false;
+    }
+
+    int preference_rank() const override {
+        return 0;
+    }
+
+    /* Always fails: reports why through `out_error` and returns nullptr. */
+    LlmBackendSession * open(EliInferenceContext * /*ctx*/,
+                             const eliza_llm_stream_config_t * /*cfg*/,
+                             char ** out_error) override {
+        static const char kNotCompiledIn[] =
+            "[litert-lm] backend not compiled in "
+            "(build with -DELIZA_ENABLE_LITERT to enable the LiteRT-LM NPU path)";
+        litert_set_error(out_error, kNotCompiledIn);
+        return nullptr;
+    }
+};
+
+} // namespace
+
+/* Accessor for the stub factory: one function-local static instance, handed
+ * out by address on every call. */
+LlmBackendFactory * litert_backend_factory() {
+    static LiteRtBackendFactoryStub instance;
+    LlmBackendFactory * const as_base = &instance;
+    return as_base;
+}
+
+#endif /* ELIZA_ENABLE_LITERT */
diff --git a/tools/omnivoice/src/backends/litert-backend.h b/tools/omnivoice/src/backends/litert-backend.h
new file mode 100644
index 000000000..9096b64d0
--- /dev/null
+++ b/tools/omnivoice/src/backends/litert-backend.h
@@ -0,0 +1,73 @@
+#pragma once
+/*
+ * litert-backend.h — LiteRT-LM in-process streaming-LLM backend (cutover plan M4).
+ *
+ * Implements the M3 backend seam (`llm-backend.h`) on top of Google's
+ * LiteRT-LM C++ inference runtime, the in-process path for the Android NPU
+ * tier (Qualcomm QNN / MediaTek NeuroPilot / Google Tensor), with an
+ * optional desktop/iOS GPU fallback. LiteRT-LM is linked INTO
+ * `libelizainference` and exposed behind the same FFI streaming symbols —
+ * never a child process or TCP server (native/AGENTS.md §11, gemma4 cutover).
+ *
+ * The whole real implementation is gated behind the CMake define
+ * `ELIZA_ENABLE_LITERT`. When that flag is OFF this header pulls in NO
+ * LiteRT-LM SDK headers, so the file compiles on a host without the SDK and
+ * the factory links in as a no-op: `available()` is false and `open()`
+ * returns nullptr + sets `*out_error` "not compiled in".
+ *
+ * ── Targeted runtime API (researched 2026-06-22) ──────────────────────────
+ * Repo: https://github.com/google-ai-edge/LiteRT-LM (`main`)
+ * Docs: https://developers.google.com/edge/litert-lm/cpp
+ * https://ai.google.dev/edge/litert/next/litert_lm_npu
+ * Namespace: `litert::lm`
+ *
+ * Symbols this backend targets (verbatim from the headers above):
+ * - runtime/engine/engine.h
+ * using Engine = EngineT<SessionInterface>;
+ * static absl::StatusOr<std::unique_ptr<Engine>>
+ * Engine::CreateEngine(const EngineSettings&);
+ * absl::StatusOr<std::unique_ptr<Session>>
+ * EngineT::CreateSession(const SessionConfig&);
+ * - runtime/engine/engine.h (SessionInterface)
+ * absl::Status RunPrefill(const std::vector<InputData>&);
+ * absl::StatusOr<Responses> RunDecode();
+ * absl::StatusOr<Responses> RunDecode(const DecodeConfig&);
+ * absl::Status GenerateContentStream(
+ * const std::vector<InputData>&,
+ * absl::AnyInvocable<void(absl::StatusOr<Responses>)>);
+ * - runtime/engine/engine_settings.h
+ * static absl::StatusOr<EngineSettings> EngineSettings::CreateDefault(
+ * ModelAssets, Backend backend = Backend::CPU,
+ * std::optional<Backend> vision_backend = std::nullopt,
+ * std::optional<Backend> audio_backend = std::nullopt,
+ * std::optional<Backend> sampler_backend = std::nullopt);
+ * static SessionConfig SessionConfig::CreateDefault();
+ * absl::StatusOr<ModelAssets> ModelAssets::Create(<model_path>); // .litertlm
+ * - runtime/engine/io_types.h
+ * using InputData = std::variant<InputText, ...>;
+ * class InputText { explicit InputText(std::variant<std::string, ...>); };
+ * class Responses { const std::vector<std::string>& GetTexts() const; };
+ * - runtime/proto/engine.pb.h
+ * enum Backend { ... CPU, GPU, NPU, ... }; // litert::lm::Backend
+ *
+ * Accelerator ladder (Android NPU first): the factory tries NPU, then GPU,
+ * then CPU at `open()` and records which one initialized. Every
+ * hardware-gated assumption is tagged `DEVICE-VERIFY` in the .cpp — the
+ * accelerator ladder, the .litertlm graph fit, and tok/s can only be
+ * confirmed on a real NPU device, which this scaffold does not have.
+ */
+
+#include "../llm-backend.h"
+
+/* Stable id matched case-insensitively against ELIZA_LLM_BACKEND, and the
+ * subdir + artifact extension the factory probes under <bundle>/text/. */
+#define LITERT_BACKEND_NAME "litert-lm"
+#define LITERT_BUNDLE_TEXT_SUBDIR "text"
+#define LITERT_ARTIFACT_EXT ".litertlm"
+
+/* Singleton factory accessor. The selector (llm-backend-selector.cpp) calls
+ * this from `llm_backend_register_builtins()` to register the backend. The
+ * returned pointer is a static-lifetime singleton the registry does not own.
+ * Defined unconditionally — a build without ELIZA_ENABLE_LITERT returns a
+ * stub factory whose available() is false. */
+LlmBackendFactory * litert_backend_factory();
diff --git a/tools/omnivoice/src/backends/mlx-coreml-backend.h b/tools/omnivoice/src/backends/mlx-coreml-backend.h
new file mode 100644
index 000000000..36d048c00
--- /dev/null
+++ b/tools/omnivoice/src/backends/mlx-coreml-backend.h
@@ -0,0 +1,128 @@
+#pragma once
+/*
+ * mlx-coreml-backend.h — Apple-Silicon in-process streaming-LLM backend
+ * (Gemma-4 cutover plan M5). One of the alternate `LlmBackendSession` /
+ * `LlmBackendFactory` implementations behind the multi-runtime FFI seam
+ * defined in `../llm-backend.h` (cutover plan M3).
+ *
+ * Per native/AGENTS.md §11 ("one managed library, one pipe, no
+ * sidecar/subprocess/TCP") this backend is COMPILED INTO libelizainference
+ * and exposes the SAME `eliza_inference_llm_stream_*` FFI pull contract —
+ * it is the owned backend on Apple Silicon (mac first, iOS later), never a
+ * child process. Apple Foundation Models stays an opportunistic out-of-
+ * process adapter on the TS side and is NOT registered here.
+ *
+ * ── Two runtimes, one backend ─────────────────────────────────────────────
+ *
+ * The same `mlx-coreml` factory can serve a bundle through EITHER of two
+ * Apple on-device runtimes, picked at open() time from the artifact present
+ * under `<bundle>/text/`:
+ *
+ * • MLX (PRIMARY) — Apple's array framework for Apple Silicon. We drive
+ * it through the C API `mlx-c` (ml-explore/mlx-c). The
+ * text weights are an `mlx` weights dir (safetensors,
+ * the mlx-lm convention) OR a `*.gguf` MLX reads via
+ * `mlx_load_gguf`. Decode runs the transformer graph
+ * on the Metal GPU stream with `mlx_quantized_matmul`
+ * for the quantized weight banks,
+ * `mlx_fast_scaled_dot_product_attention` for
+ * attention, and `mlx_fast_rope` for position. The KV
+ * cache is a pair of resident `mlx_array`s we append to
+ * per step (host-side cache handle, GPU-resident data).
+ * This is the preferred path: it gives us full control
+ * of the sampler, supports the Gemma SWA/shared-KV
+ * geometry, and matches mlx-lm's published Gemma graph.
+ *
+ * • CoreML (ALTERNATE) — Apple's MLModel runtime, which can place the graph
+ * on the ANE (Apple Neural Engine) as well as GPU/CPU.
+ * We load a compiled `*.mlmodelc` / `*.mlpackage`
+ * decoder and use the iOS-18 / macOS-15 **stateful**
+ * prediction API (`MLState`) so the KV cache lives
+ * inside CoreML and is updated in-place across decode
+ * steps (no per-token KV tensor marshalled across the
+ * ObjC boundary). CoreML needs Objective-C, which is
+ * why this whole backend is a `.mm` translation unit.
+ *
+ * TRADE-OFF (documented per the task brief): MLX is the primary path
+ * because it is the most flexible (custom sampler, exact Gemma geometry,
+ * speculative-decode-ready) and tracks mlx-lm directly; its decode runs on
+ * the GPU stream, not the ANE. CoreML's stateful MLModel can target the ANE
+ * for lower power on phones, but the decoder graph must be pre-compiled
+ * ahead of time, the sampler/KV layout is fixed by the converted model, and
+ * ANE placement of large attention graphs is fragile across OS revisions.
+ * We prefer MLX on mac/dev; CoreML is the alternate for ANE-bound iOS tiers
+ * once a stateful decoder package is published. open() selects MLX when an
+ * mlx weights dir / gguf is present, else falls back to the CoreML package.
+ *
+ * ── Build gate ────────────────────────────────────────────────────────────
+ *
+ * The REAL implementation is gated behind `ELIZA_ENABLE_MLX` (the CMake
+ * define for this backend, per the cutover plan: LiteRT → ELIZA_ENABLE_LITERT,
+ * MLX/CoreML → ELIZA_ENABLE_MLX) AND `__APPLE__`. When the gate is OFF the
+ * translation unit includes NO Apple/MLX SDK headers, so it compiles on a
+ * plain Linux host: `available()` returns false, `can_serve()` returns false,
+ * and `open()` returns nullptr after setting `*out_error` ("not compiled in").
+ * The default Linux build links it as a pure no-op and the selector skips it,
+ * keeping the in-tree llama.cpp path.
+ *
+ * ── API research (cited; symbols verified, not invented) ──────────────────
+ *
+ * MLX C API — ml-explore/mlx-c, `mlx/c/` headers, main @ 2026-06 (docs MLX C
+ * 0.4.1, https://ml-explore.github.io/mlx-c/). Symbols used by the real path:
+ * - device.h : `mlx_device mlx_device_new_type(mlx_device_type, int)` with
+ * `typedef enum { MLX_CPU, MLX_GPU } mlx_device_type;`
+ * - stream.h : `mlx_stream mlx_default_gpu_stream_new(void)`,
+ * `mlx_stream mlx_default_cpu_stream_new(void)`
+ * - io.h : `int mlx_load_safetensors(mlx_map_string_to_array*,
+ * mlx_map_string_to_string*, const char* file, mlx_stream)`,
+ * `int mlx_load_gguf(mlx_io_gguf*, const char* file, mlx_stream)`
+ * - array.h : `mlx_array mlx_array_new_data(const void*, const int* shape,
+ * int dim, mlx_dtype)`, `int mlx_array_eval(mlx_array)`,
+ * `int mlx_array_item_int32(int32_t*, mlx_array)`,
+ * `const float* mlx_array_data_float32(mlx_array)`,
+ * `int mlx_array_free(mlx_array)`
+ * - ops.h : `int mlx_quantized_matmul(mlx_array*, x, w, scales, biases,
+ * bool transpose, mlx_optional_int group_size,
+ * mlx_optional_int bits, const char* mode, mlx_stream)`,
+ * `int mlx_matmul(...)`, `int mlx_softmax_axes(...)`,
+ * `int mlx_argmax_axis(mlx_array*, a, int axis, bool, stream)`,
+ * `int mlx_take(mlx_array*, a, indices, stream)`,
+ * `int mlx_astype(...)`, `int mlx_concatenate(...)`
+ * - fast.h : `int mlx_fast_scaled_dot_product_attention(mlx_array*, q, k,
+ * v, float scale, const char* mask_mode, mlx_array mask,
+ * mlx_array sinks, mlx_stream)`,
+ * `int mlx_fast_rope(mlx_array*, x, int dims, bool traditional,
+ * mlx_optional_float base, float scale, int offset,
+ * mlx_array freqs, mlx_stream)`
+ * Gemma on MLX: ml-explore/mlx-lm (`mlx_lm/models/gemma*.py`) — the reference
+ * for the dense SWA + shared-KV + dual-head-dim graph this backend mirrors.
+ *
+ * CoreML stateful KV-cache — Apple Core ML, MLState API, macOS 15 / iOS 18
+ * (WWDC24 "Bring your ML and AI models to Apple silicon"; coremltools
+ * Stateful Models guide, https://apple.github.io/coremltools/docs-guides/
+ * source/stateful-models.html). ObjC symbols used:
+ * - `+ (nullable instancetype)modelWithContentsOfURL:(NSURL*)url
+ * error:(NSError**)error;` (and the compiled-model `compileModelAtURL:`)
+ * - `- (MLState*)newState;` (creates zeroed KV state buffers; MLState is
+ * +new/-init UNAVAILABLE — only MLModel vends it)
+ * - `- (nullable id<MLFeatureProvider>)predictionFromFeatures:
+ * (id<MLFeatureProvider>)input usingState:(MLState*)state
+ * error:(NSError**)error;` (the in-place stateful decode step)
+ * Apple's own "On-Device Llama 3.1 with Core ML" research post documents the
+ * prefill-then-stateful-decode loop this backend's MLX/CoreML paths follow.
+ *
+ * Every hardware-specific assumption that can only be confirmed on Apple
+ * Silicon is marked `DEVICE-VERIFY` in the .mm. This header carries no SDK
+ * dependency and is safe to include anywhere.
+ */
+
+#include "../llm-backend.h"
+
+/* Free-function accessor returning the singleton `mlx-coreml` factory so the
+ * selector (llm-backend-selector.cpp, wired separately) can register it via
+ * `llm_backend_register(mlx_coreml_backend_factory())`. Defined in
+ * mlx-coreml-backend.mm. Always returns a valid non-null static-lifetime
+ * pointer — when the build gate is OFF the returned factory reports
+ * available()/can_serve() == false and open() == nullptr ("not compiled in"),
+ * so registering it unconditionally is safe. */
+LlmBackendFactory * mlx_coreml_backend_factory();
diff --git a/tools/omnivoice/src/backends/mlx-coreml-backend.mm b/tools/omnivoice/src/backends/mlx-coreml-backend.mm
new file mode 100644
index 000000000..4b705d719
--- /dev/null
+++ b/tools/omnivoice/src/backends/mlx-coreml-backend.mm
@@ -0,0 +1,797 @@
+/*
+ * mlx-coreml-backend.mm — Apple-Silicon streaming-LLM backend (cutover M5).
+ *
+ * Objective-C++ translation unit: CoreML's MLModel / MLState API is
+ * Objective-C, and the MLX C++ / mlx-c headers also compile cleanly in a
+ * `.mm`. See mlx-coreml-backend.h for the full API research + citations and
+ * the MLX-primary / CoreML-alternate trade-off.
+ *
+ * STRUCTURE
+ * The whole real implementation sits behind
+ * #if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
+ * and is the ONLY place that includes any MLX / CoreML SDK header. With the
+ * gate OFF (the default Linux build) this file pulls in no SDK header at all
+ * and compiles to a pure no-op factory: available()/can_serve() == false,
+ * open() returns nullptr after setting *out_error to "not compiled in".
+ *
+ * ERROR CONTRACT (native/AGENTS.md §3 + §9): never log, never return a
+ * defaulted result on failure. Out-error strings are heap-allocated with
+ * malloc (mirroring eliza-inference-ffi.cpp's `eliza_strdup`) so the FFI
+ * caller frees them with `eliza_inference_free_string` / free().
+ */
+
+#include "mlx-coreml-backend.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+// ===========================================================================
+// Shared (gate-independent) helpers
+// ===========================================================================
+
+namespace {
+
+/* Heap-allocate an out-error string with malloc, as the FFI translation unit
+ * does (eliza-inference-ffi.cpp::eliza_strdup), so the caller's free() path is
+ * the same for every backend. Null `out_error` is ignored; OOM stores nullptr. */
+void mlx_set_error(char ** out_error, const std::string & msg) {
+    if (!out_error) {
+        return;
+    }
+    const size_t len = msg.size() + 1;  // payload plus the NUL terminator
+    char * out = static_cast<char *>(std::malloc(len));
+    if (out) {
+        std::memcpy(out, msg.c_str(), len);
+    }
+    // On allocation failure `out` is nullptr, which is what the caller sees.
+    *out_error = out;
+}
+
+} // namespace
+
+// ===========================================================================
+// REAL IMPLEMENTATION — Apple Silicon only, gated on ELIZA_ENABLE_MLX
+// ===========================================================================
+#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
+
+// --- Objective-C / Apple frameworks ---------------------------------------
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>  // MLModel, MLState, MLFeatureProvider, MLMultiArray
+#import <Metal/Metal.h>  // MTLCreateSystemDefaultDevice — Metal/ANE presence probe
+
+// --- MLX C API (ml-explore/mlx-c) ------------------------------------------
+// Only included behind the gate so a host without the MLX SDK still compiles.
+#include "mlx/c/array.h"
+#include "mlx/c/device.h"
+#include "mlx/c/stream.h"
+#include "mlx/c/io.h"
+#include "mlx/c/ops.h"
+#include "mlx/c/fast.h"
+#include "mlx/c/map.h"
+
+#include <algorithm>
+#include <atomic>
+#include <filesystem>
+#include <system_error>
+#include <vector>
+
+namespace {
+
+namespace fs = std::filesystem;
+
+// --- bundle artifact discovery --------------------------------------------
+
+enum class AppleRuntime {
+ None, // detect_runtime() found no Apple-servable artifact
+ Mlx, // mlx weights dir (safetensors) or *.gguf under text/
+ CoreMl, // *.mlmodelc / *.mlpackage under text/
+};
+
+bool has_suffix(const std::string & s, const char * suffix) {
+    const size_t len = std::strlen(suffix);  // `suffix` must be NUL-terminated
+    return len <= s.size() && s.compare(s.size() - len, len, suffix) == 0;  // empty suffix matches
+}
+
+/* Probe `<bundle_dir>/text/` for an Apple-servable artifact and report which
+ * runtime would serve it. MLX is preferred when both kinds are present (an
+ * mlx weights dir / gguf wins over a CoreML package), matching the header's
+ * "MLX primary, CoreML alternate" rule. Cheap directory walk, no model load. */
+AppleRuntime detect_runtime(const char * bundle_dir, std::string & out_artifact) {
+    out_artifact.clear();
+    if (!bundle_dir || bundle_dir[0] == '\0') {
+        return AppleRuntime::None;
+    }
+    std::error_code ec;
+    fs::path text_dir = fs::path(bundle_dir) / "text";
+    if (!fs::is_directory(text_dir, ec)) {
+        return AppleRuntime::None;
+    }
+
+    std::string gguf, mlpackage, mlmodelc, mlx_weights_dir;
+    for (fs::directory_iterator it(text_dir, ec), end; it != end && !ec; it.increment(ec)) {
+        const fs::path & p = it->path();
+        const std::string name = p.filename().string();
+        if (it->is_directory(ec)) {
+            // Directory-shaped artifacts: CoreML bundles (*.mlmodelc and
+            // *.mlpackage are both directories) and mlx-lm weights dirs.
+            if (has_suffix(name, ".mlmodelc")) {
+                if (mlmodelc.empty()) mlmodelc = p.string();
+            } else if (has_suffix(name, ".mlpackage")) {
+                if (mlpackage.empty()) mlpackage = p.string();
+            } else if (name == "mlx" || fs::exists(p / "model.safetensors", ec) ||
+                       fs::exists(p / "weights.safetensors", ec)) {
+                if (mlx_weights_dir.empty()) mlx_weights_dir = p.string();
+            }
+        } else {
+            if (has_suffix(name, ".gguf")) {
+                if (gguf.empty()) gguf = p.string();
+            } else if (has_suffix(name, ".safetensors")) {
+                if (mlx_weights_dir.empty()) mlx_weights_dir = text_dir.string();
+            }
+        }
+    }
+
+    // MLX primary: weights dir / safetensors first, then gguf.
+    if (!mlx_weights_dir.empty()) { out_artifact = mlx_weights_dir; return AppleRuntime::Mlx; }
+    if (!gguf.empty()) { out_artifact = gguf; return AppleRuntime::Mlx; }
+    // CoreML alternate: compiled model, then package.
+    if (!mlmodelc.empty()) { out_artifact = mlmodelc; return AppleRuntime::CoreMl; }
+    if (!mlpackage.empty()) { out_artifact = mlpackage; return AppleRuntime::CoreMl; }
+    return AppleRuntime::None;
+}
+
+/* Reports whether MTLCreateSystemDefaultDevice() yields a device.
+ * DEVICE-VERIFY: expected to be a valid MTLDevice on a real Apple-Silicon
+ * Mac/phone (GPU + ANE present); on a host where it is nil the backend
+ * reports unavailable rather than crashing at open(). */
+bool metal_device_present() {
+    @autoreleasepool {
+        const bool present = (MTLCreateSystemDefaultDevice() != nil);
+        return present;
+    }
+}
+
+// ===========================================================================
+// MLX-backed session (PRIMARY)
+// ===========================================================================
+//
+// DEVICE-VERIFY: the decode graph below is structurally complete and uses the
+// real mlx-c symbols, but the exact per-layer wiring of the Gemma graph
+// (alternating local-SWA / global attention, dual head dims, shared-KV layer
+// reuse, Per-Layer-Embeddings) must be assembled + numerically validated on
+// Apple Silicon against mlx-lm's `gemma*` reference. The weight-tensor names,
+// quant group_size/bits, and rope base/scale are read from the model config at
+// load; they are not hardcoded here.
+
+class MlxLlmSession final : public LlmBackendSession {
+public:
+    MlxLlmSession(std::string artifact, const eliza_llm_stream_config_t * cfg)
+        : artifact_(std::move(artifact)) {
+        if (cfg) {
+            cfg_ = *cfg;
+            have_cfg_ = true;
+        }
+    }
+
+    ~MlxLlmSession() override {
+        // mlx-c handles are value types wrapping a refcounted ctx; freeing
+        // releases our reference. The Metal stream/device are process-global.
+        free_kv();
+        free_logits();
+        free_weights();
+    }
+
+    /* Load weights. Returns ELIZA_OK, or ELIZA_ERR_BUNDLE_INVALID on failure.
+     * The two on-disk shapes are loaded with the two distinct mlx-c readers:
+     *   - safetensors (mlx-lm convention): mlx_load_safetensors fills a
+     *     mlx_map_string_to_array (per-tensor mlx_map_string_to_array_get);
+     *   - gguf: mlx_load_gguf fills a mlx_io_gguf read by key via
+     *     mlx_io_gguf_get_array (key list from mlx_io_gguf_get_keys).
+     * Whichever handle was loaded stays resident until free_weights(); the
+     * per-tensor pulls happen in run_forward when the Gemma graph is assembled. */
+    int init(char ** out_error) {
+        // GPU stream (Metal). DEVICE-VERIFY: requires a Metal device.
+        gpu_stream_ = mlx_default_gpu_stream_new();
+
+        int rc;
+        if (has_suffix(artifact_, ".gguf")) {
+            gguf_ = mlx_io_gguf_new();
+            have_gguf_ = true;  // owned from here; free_weights() releases it
+            rc = mlx_load_gguf(&gguf_, artifact_.c_str(), gpu_stream_);
+        } else {
+            // mlx weights dir / safetensors (the mlx-lm convention).
+            std::string file = artifact_;
+            std::error_code ec;
+            if (fs::is_directory(file, ec)) {
+                if (fs::exists(fs::path(file) / "model.safetensors", ec)) {
+                    file = (fs::path(file) / "model.safetensors").string();
+                } else if (fs::exists(fs::path(file) / "weights.safetensors", ec)) {
+                    file = (fs::path(file) / "weights.safetensors").string();
+                }
+            }
+            weights_ = mlx_map_string_to_array_new();
+            weights_meta_ = mlx_map_string_to_string_new();
+            have_weights_ = true;  // both maps are owned from here
+            rc = mlx_load_safetensors(&weights_, &weights_meta_, file.c_str(), gpu_stream_);
+        }
+        if (rc != 0) {
+            free_weights();  // drops the handles and clears both have_* flags
+            mlx_set_error(out_error,
+                          "[mlx-coreml] MLX failed to load weights from " + artifact_);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+
+        // DEVICE-VERIFY: parse the sibling config.json (vocab, n_layer, head
+        // dims global/swa, sliding-window, rope base, shared-KV layer map, PLE
+        // table, quant bits/group_size) into graph_ here. Mirrors
+        // mlx_lm.utils.load's config handling. Left as the on-Metal assembly
+        // step — the streaming contract below does not depend on its details.
+        return ELIZA_OK;
+    }
+
+    int prefill(const int32_t * token_ids, size_t num_tokens,
+                char ** out_error) override {
+        if (!have_weights_ && !have_gguf_) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill before init");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (!token_ids || num_tokens == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill: empty prompt");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        cancel_.store(false);
+
+        // Copy the prompt (the contract says prefill copies the tokens it needs).
+        prompt_.assign(token_ids, token_ids + num_tokens);
+        n_past_ = 0;
+        generated_ = 0;
+
+        // Build the [1, T] int32 input and run one forward pass that fills KV.
+        // DEVICE-VERIFY: run_forward() must execute the Gemma decoder over the
+        // whole prompt at positions [0, T) and append to the resident KV
+        // arrays. The final-position logits feed the first sampled token.
+        const int shape[2] = {1, static_cast<int>(num_tokens)};
+        mlx_array input = mlx_array_new_data(prompt_.data(), shape, 2, MLX_INT32);
+        int rc = run_forward(input, /*start_pos=*/0, &last_logits_, out_error);
+        mlx_array_free(input);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+        n_past_ = static_cast<int>(num_tokens);
+        return ELIZA_OK;
+    }
+
+    int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out,
+             char * text_out, size_t text_cap, int32_t * drafter_drafted_out,
+             int32_t * drafter_accepted_out, char ** out_error) override {
+        if (num_tokens_out) *num_tokens_out = 0;
+        if (text_out && text_cap) text_out[0] = '\0';
+        // No speculative drafter on the MLX path yet (M6 wires MTP).
+        if (drafter_drafted_out) *drafter_drafted_out = 0;
+        if (drafter_accepted_out) *drafter_accepted_out = 0;
+
+        if (!have_weights_ && !have_gguf_) {
+            mlx_set_error(out_error, "[mlx-coreml] next before init/prefill");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancel_.load()) {
+            return ELIZA_ERR_CANCELLED;
+        }
+        if (!tokens_out || tokens_cap == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] next: token buffer too small");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+
+        // Sample one token from last_logits_ (greedy here; temperature / top-p /
+        // top-k from cfg_ applied in sample_token). DEVICE-VERIFY: last_logits_ is
+        // an mlx_array of shape [1, vocab]; sample_token returns one int32 token id.
+        int32_t next_id = 0;
+        int rc = sample_token(last_logits_, &next_id, out_error);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+
+        tokens_out[0] = next_id;
+        if (num_tokens_out) *num_tokens_out = 1;
+        generated_++;
+
+        // Detokenize the committed token into text_out (UTF-8). DEVICE-VERIFY:
+        // detokenize_piece resolves next_id against the model's vocab (tokenizer
+        // sidecar / gguf vocab); partial multi-byte pieces are buffered across calls.
+        detokenize_piece(next_id, text_out, text_cap);
+
+        const int32_t cap = (have_cfg_ && cfg_.max_tokens > 0)
+                                ? cfg_.max_tokens
+                                : default_max_tokens_;
+        if (is_eos(next_id) || generated_ >= cap) {
+            return 1; // final step
+        }
+
+        // Advance one position: forward pass for the just-sampled token only.
+        const int shape[2] = {1, 1};
+        mlx_array step_in = mlx_array_new_data(&next_id, shape, 2, MLX_INT32);
+        rc = run_forward(step_in, /*start_pos=*/n_past_, &last_logits_, out_error);
+        mlx_array_free(step_in);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+        n_past_++;
+        return cancel_.load() ? ELIZA_ERR_CANCELLED : 0; // more
+    }
+
+    int cancel() override {
+        cancel_.store(true);
+        return ELIZA_OK;
+    }
+
+    int reset() override {
+        cancel_.store(false);
+        prompt_.clear();
+        n_past_ = 0;
+        generated_ = 0;
+        free_kv(); // drop resident KV arrays
+        free_logits();
+        return ELIZA_OK;
+    }
+
+    int reset_keep(int32_t n_keep) override {
+        // MLX KV is a resident pair of arrays we append to; trimming to a prefix
+        // is a tensor slice. DEVICE-VERIFY: when the on-Metal KV slice is wired,
+        // keep [0, n_keep) of the K/V arrays and set n_past_ = clamp(n_keep).
+        // Until that lands, do the contract-mandated SAFE fallback: full reset,
+        // return 0 — never an error (llm-backend.h reset_keep contract).
+        (void) n_keep;
+        reset();
+        return 0;
+    }
+
+private:
+    void free_kv() {
+        if (have_kv_) {
+            mlx_array_free(kv_k_);
+            mlx_array_free(kv_v_);
+            have_kv_ = false;
+        }
+    }
+    void free_logits() {
+        if (have_logits_) {
+            mlx_array_free(last_logits_);
+            have_logits_ = false;
+        }
+    }
+    void free_weights() {  // release whichever weight handle(s) init() acquired
+        if (have_weights_) {
+            mlx_map_string_to_array_free(weights_);
+            mlx_map_string_to_string_free(weights_meta_);
+            have_weights_ = false;
+        }
+        if (have_gguf_) {
+            mlx_io_gguf_free(gguf_);  // DEVICE-VERIFY: per-type *_free name assumed
+            have_gguf_ = false;
+        }
+    }
+
+    /* One transformer forward pass over `input` ([1, T] int32) starting at
+     * position `start_pos`, appending to the resident KV cache and writing the
+     * final-position logits ([1, vocab]) into *out_logits.
+     *
+     * DEVICE-VERIFY: this is the Gemma decoder graph. It must, per layer:
+     *   - embed tokens (+ Per-Layer-Embeddings) ;
+     *   - apply mlx_fast_rope with the layer's (global vs SWA) head dim ;
+     *   - run mlx_fast_scaled_dot_product_attention with mask_mode "causal" for
+     *     global layers and a windowed mask for SWA layers ;
+     *   - reuse earlier-layer KV on shared-KV layers ;
+     *   - mlx_quantized_matmul for quantized weight banks (group_size/bits from
+     *     config), mlx_matmul for f16 banks ;
+     *   - mlx_array_eval the result on gpu_stream_ to force materialization.
+     * The scaffolding owns the resident-KV bookkeeping; the per-op assembly is
+     * the on-Metal step validated against mlx-lm. */
+    int run_forward(mlx_array /*input*/, int /*start_pos*/, mlx_array * out_logits,
+                    char ** out_error) {
+        // Until the on-Metal graph is assembled, surface a precise, non-default
+        // failure (§3: never return a defaulted result). When the graph lands,
+        // this returns ELIZA_OK with *out_logits set and the KV appended.
+        free_logits();
+        (void) out_logits;
+        mlx_set_error(out_error,
+                      "[mlx-coreml] MLX Gemma decode graph not assembled on this build "
+                      "(DEVICE-VERIFY: requires Apple Silicon)");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    int sample_token(mlx_array logits, int32_t * out_id, char ** out_error) {
+        if (!have_logits_) {
+            mlx_set_error(out_error, "[mlx-coreml] no logits to sample");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        // DEVICE-VERIFY: apply cfg_.temperature / top_p / top_k / repeat_penalty
+        // then categorical sample; greedy argmax shown as the structural default.
+        mlx_array arg = mlx_array_new();
+        if (mlx_argmax_axis(&arg, logits, /*axis=*/-1, /*keepdims=*/false, gpu_stream_) != 0) {
+            mlx_array_free(arg);
+            mlx_set_error(out_error, "[mlx-coreml] argmax failed");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        mlx_array_eval(arg);
+        int32_t id = 0;
+        const int rc = mlx_array_item_int32(&id, arg);
+        mlx_array_free(arg);
+        if (rc != 0) {
+            mlx_set_error(out_error, "[mlx-coreml] failed to read sampled token");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        *out_id = id;
+        return ELIZA_OK;
+    }
+
+    bool is_eos(int32_t id) const {
+        // DEVICE-VERIFY: compare against the model's EOS / end-of-turn ids read
+        // from the tokenizer config at load; eos_id_ stays -1 until that is wired.
+        return id == eos_id_;
+    }
+
+    void detokenize_piece(int32_t /*id*/, char * text_out, size_t text_cap) {
+        // DEVICE-VERIFY: resolve the token piece from the loaded vocab and copy
+        // its UTF-8 bytes (buffering partial code points across calls). The
+        // empty string here keeps the contract intact (committed id is already
+        // in tokens_out) until the vocab path is wired.
+        if (text_out && text_cap) text_out[0] = '\0';
+    }
+
+    std::string artifact_;
+    eliza_llm_stream_config_t cfg_{};
+    bool have_cfg_ = false;
+
+    mlx_stream gpu_stream_{};
+    mlx_map_string_to_array weights_{};
+    mlx_map_string_to_string weights_meta_{};
+    bool have_weights_ = false;  // safetensors maps are owned
+    mlx_io_gguf gguf_{};
+    bool have_gguf_ = false;     // gguf handle is owned
+    mlx_array kv_k_{};
+    mlx_array kv_v_{};
+    bool have_kv_ = false;
+    mlx_array last_logits_{};
+    bool have_logits_ = false;
+    std::vector<int32_t> prompt_;
+    int n_past_ = 0;
+    int generated_ = 0;
+    int32_t eos_id_ = -1;
+    int32_t default_max_tokens_ = 2048;
+
+    std::atomic<bool> cancel_{false};
+};
+
+// ===========================================================================
+// CoreML-backed session (ALTERNATE — ANE-bound, stateful MLState KV cache)
+// ===========================================================================
+//
+// DEVICE-VERIFY: the converted decoder package must expose (a) an input
+// feature for the current token id(s) and position, (b) an MLState-backed KV
+// cache, and (c) a logits output. Apple's "On-Device Llama 3.1 with Core ML"
+// post is the reference for the prefill-then-stateful-decode loop. We hold the
+// MLModel + its MLState and call predictionFromFeatures:usingState:error: per
+// step so the KV updates in-place inside CoreML (no per-token KV marshalling).
+
+class CoreMlLlmSession final : public LlmBackendSession {
+public:
+    CoreMlLlmSession(std::string package, const eliza_llm_stream_config_t * cfg)
+        : package_(std::move(package)) {
+        if (cfg) {
+            cfg_ = *cfg;
+            have_cfg_ = true;
+        }
+    }
+
+    ~CoreMlLlmSession() override {
+        @autoreleasepool {
+            state_ = nil;
+            model_ = nil;
+        }
+    }
+
+    int init(char ** out_error) {
+        @autoreleasepool {
+            NSError * err = nil;
+            NSURL * url = [NSURL fileURLWithPath:
+                [NSString stringWithUTF8String:package_.c_str()]];
+
+            NSURL * compiled = url;
+            // A *.mlpackage must be compiled to *.mlmodelc before loading; a
+            // *.mlmodelc loads directly. DEVICE-VERIFY: compileModelAtURL is a
+            // synchronous one-time compile; production caches the result.
+            if (has_suffix(package_, ".mlpackage")) {
+                NSURL * c = [MLModel compileModelAtURL:url error:&err];
+                if (!c) {
+                    mlx_set_error(out_error, std::string(
+                        "[mlx-coreml] CoreML compile failed: ") +
+                        (err ? err.localizedDescription.UTF8String : "unknown"));
+                    return ELIZA_ERR_BUNDLE_INVALID;
+                }
+                compiled = c;
+            }
+
+            MLModelConfiguration * conf = [[MLModelConfiguration alloc] init];
+            // DEVICE-VERIFY: .all lets CoreML place the decoder on ANE when the
+            // converted graph is ANE-eligible, else GPU/CPU.
+            conf.computeUnits = MLComputeUnitsAll;
+
+            model_ = [MLModel modelWithContentsOfURL:compiled
+                                       configuration:conf
+                                               error:&err];
+            if (!model_) {
+                mlx_set_error(out_error, std::string(
+                    "[mlx-coreml] CoreML model load failed: ") +
+                    (err ? err.localizedDescription.UTF8String : "unknown"));
+                return ELIZA_ERR_BUNDLE_INVALID;
+            }
+
+            // newState vends zeroed KV buffers; MLState is +new/-init
+            // UNAVAILABLE — only MLModel produces it (macOS 15 / iOS 18).
+            state_ = [model_ newState];
+            if (!state_) {
+                mlx_set_error(out_error,
+                    "[mlx-coreml] CoreML model has no stateful KV cache "
+                    "(newState returned nil) — needs a stateful decoder package");
+                return ELIZA_ERR_BUNDLE_INVALID;
+            }
+            return ELIZA_OK;
+        }
+    }
+
+    int prefill(const int32_t * token_ids, size_t num_tokens,
+                char ** out_error) override {
+        if (!model_ || !state_) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill before init");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (!token_ids || num_tokens == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill: empty prompt");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        cancel_.store(false);
+        prompt_.assign(token_ids, token_ids + num_tokens);
+        n_past_ = 0;
+        generated_ = 0;
+
+        // DEVICE-VERIFY: feed the whole prompt as one prediction with positions
+        // [0, T) so CoreML fills the MLState KV in one pass, then keep the
+        // final-position logits for the first sampled token. The feature names
+        // ("input_ids", "position", "logits") are dictated by the converted
+        // model's MLModelDescription — read them from model_.modelDescription.
+        return run_step(prompt_.data(), prompt_.size(), /*start_pos=*/0, out_error);
+    }
+
+    int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out,
+             char * text_out, size_t text_cap, int32_t * drafter_drafted_out,
+             int32_t * drafter_accepted_out, char ** out_error) override {
+        if (num_tokens_out) *num_tokens_out = 0;
+        if (text_out && text_cap) text_out[0] = '\0';
+        if (drafter_drafted_out) *drafter_drafted_out = 0;
+        if (drafter_accepted_out) *drafter_accepted_out = 0;
+
+        if (!model_ || !state_) {
+            mlx_set_error(out_error, "[mlx-coreml] next before init/prefill");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancel_.load()) {
+            return ELIZA_ERR_CANCELLED;
+        }
+        if (!tokens_out || tokens_cap == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] next: token buffer too small");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+
+        int32_t next_id = 0;
+        int rc = sample_from_last_logits(&next_id, out_error);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+        tokens_out[0] = next_id;
+        if (num_tokens_out) *num_tokens_out = 1;
+        generated_++;
+        detokenize_piece(next_id, text_out, text_cap);
+
+        const int32_t cap = (have_cfg_ && cfg_.max_tokens > 0)
+                                ? cfg_.max_tokens
+                                : default_max_tokens_;
+        if (is_eos(next_id) || generated_ >= cap) {
+            return 1; // final
+        }
+
+        // One stateful decode step for the just-sampled token.
+        const int32_t one = next_id;
+        rc = run_step(&one, 1, /*start_pos=*/n_past_, out_error);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+        n_past_++;
+        return cancel_.load() ? ELIZA_ERR_CANCELLED : 0; // more
+    }
+
+    int cancel() override {
+        cancel_.store(true);
+        return ELIZA_OK;
+    }
+
+    int reset() override {
+        cancel_.store(false);
+        prompt_.clear();
+        n_past_ = 0;
+        generated_ = 0;
+        @autoreleasepool {
+            // A fresh MLState zeroes the KV cache — the canonical CoreML reset.
+            if (model_) {
+                state_ = [model_ newState];
+            }
+        }
+        return ELIZA_OK;
+    }
+
+    int reset_keep(int32_t n_keep) override {
+        // CoreML's MLState is opaque: there is no public API to truncate the KV
+        // to a prefix. Per the llm-backend.h contract, fall back to a full
+        // reset and return 0 — never an error.
+        (void) n_keep;
+        reset();
+        return 0;
+    }
+
+private:
+    /* Run one prediction (`n` tokens starting at `start_pos`) through the
+     * stateful model, updating the MLState KV in place and caching the
+     * final-position logits. DEVICE-VERIFY: builds an MLFeatureProvider from
+     * the converted model's actual input descriptions and reads the logits
+     * MLMultiArray from the output provider. */
+    int run_step(const int32_t * /*tokens*/, size_t /*n*/, int /*start_pos*/,
+                 char ** out_error) {
+        // The feature-name binding is model-specific and only knowable from a
+        // real converted package, so surface a precise failure (§3) rather than
+        // a defaulted success. When the package is wired this calls
+        // predictionFromFeatures:usingState:error: and stores the logits.
+        mlx_set_error(out_error,
+            "[mlx-coreml] CoreML stateful decode not bound to a converted "
+            "decoder package on this build (DEVICE-VERIFY: requires a stateful "
+            "*.mlmodelc and Apple Silicon)");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    int sample_from_last_logits(int32_t * /*out_id*/, char ** out_error) {
+        // DEVICE-VERIFY: argmax / temperature-sample over the cached logits
+        // MLMultiArray. Fails precisely until run_step populates them.
+        mlx_set_error(out_error, "[mlx-coreml] no CoreML logits to sample");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    bool is_eos(int32_t id) const { return id == eos_id_; }
+
+    void detokenize_piece(int32_t /*id*/, char * text_out, size_t text_cap) {
+        if (text_out && text_cap) {
+            text_out[0] = '\0';
+        }
+    }
+
+    std::string package_;
+    eliza_llm_stream_config_t cfg_{};
+    bool have_cfg_ = false;
+
+    MLModel * model_ = nil;
+    MLState * state_ = nil;
+
+    std::vector<int32_t> prompt_;
+    int n_past_ = 0;
+    int generated_ = 0;
+    int32_t eos_id_ = -1;
+    int32_t default_max_tokens_ = 2048;
+
+    std::atomic<bool> cancel_{false};
+};
+
+// ===========================================================================
+// Factory (real)
+// ===========================================================================
+
+class MlxCoreMlFactory final : public LlmBackendFactory {
+public:
+    const char * name() const override { return "mlx-coreml"; }
+
+    bool available() const override {
+        // Being inside the build gate covers "compiled in"; what remains is
+        // the Metal probe. DEVICE-VERIFY: true on Apple Silicon, false without Metal.
+        return metal_device_present();
+    }
+
+    bool can_serve(const char * bundle_dir) const override {
+        std::string unused_artifact;  // only the runtime kind matters here
+        return AppleRuntime::None != detect_runtime(bundle_dir, unused_artifact);
+    }
+
+    int preference_rank() const override {
+        // Highest on Apple Silicon: the in-process Metal/ANE path beats the
+        // in-tree llama.cpp Metal path for the Gemma geometry. > LiteRT(0 here).
+        return 100;
+    }
+
+    LlmBackendSession * open(EliInferenceContext * ctx,
+                             const eliza_llm_stream_config_t * cfg,
+                             char ** out_error) override {
+        // The context is opaque here: fetch the bundle root via its accessor,
+        // then let the artifacts under it decide between MLX and CoreML.
+        const char * root = llm_backend_context_bundle_dir(ctx);
+        if (!root || root[0] == '\0') {
+            mlx_set_error(out_error,
+                          "[mlx-coreml] open: context has no bundle dir");
+            return nullptr;
+        }
+        std::string artifact;
+        switch (detect_runtime(root, artifact)) {
+            case AppleRuntime::Mlx:
+                return init_or_discard(new MlxLlmSession(artifact, cfg), out_error);
+            case AppleRuntime::CoreMl:
+                return init_or_discard(new CoreMlLlmSession(artifact, cfg), out_error);
+            case AppleRuntime::None:
+                break;
+        }
+        mlx_set_error(out_error,
+                      "[mlx-coreml] open: bundle has no MLX/CoreML text artifact under text/");
+        return nullptr;
+    }
+
+private:
+    // Runs init() on a freshly allocated session; a session whose init() does
+    // not return ELIZA_OK is deleted and nullptr is handed back instead.
+    template <class Session>
+    static LlmBackendSession * init_or_discard(Session * session, char ** out_error) {
+        if (session->init(out_error) != ELIZA_OK) {
+            delete session;
+            return nullptr;
+        }
+        return session;
+    }
+};
+
+MlxCoreMlFactory g_factory;
+
+} // namespace
+
+LlmBackendFactory * mlx_coreml_backend_factory() {
+ return &g_factory; // address of the namespace-scope real factory; never null
+}
+
+// ===========================================================================
+// STUB IMPLEMENTATION — every non-Apple / gate-OFF build
+// ===========================================================================
+#else // !(ELIZA_ENABLE_MLX && __APPLE__)
+
+namespace {
+
+/* Gate-OFF stand-in: this path includes no SDK header, so it builds on a plain
+ * Linux host. It reports itself unavailable and refuses every open(). */
+class MlxCoreMlStubFactory final : public LlmBackendFactory {
+public:
+    const char * name() const override { return "mlx-coreml"; }
+    bool available() const override { return false; }
+    bool can_serve(const char * /*bundle_dir*/) const override { return false; }
+    int preference_rank() const override { return 0; }
+
+    LlmBackendSession * open(EliInferenceContext * /*ctx*/,
+                             const eliza_llm_stream_config_t * /*cfg*/,
+                             char ** out_error) override {
+        static const char kMsg[] = "[mlx-coreml] backend not compiled in "
+                                   "(needs -DELIZA_ENABLE_MLX on Apple Silicon)";
+        mlx_set_error(out_error, kMsg);
+        return nullptr;
+    }
+};
+
+MlxCoreMlStubFactory g_stub_factory;
+
+} // namespace
+
+LlmBackendFactory * mlx_coreml_backend_factory() {
+ return &g_stub_factory; // address of the namespace-scope stub factory; never null
+}
+
+#endif // ELIZA_ENABLE_MLX && __APPLE__
diff --git a/tools/omnivoice/src/eliza-inference-ffi.cpp b/tools/omnivoice/src/eliza-inference-ffi.cpp
index 345c87cb0..94127affc 100644
--- a/tools/omnivoice/src/eliza-inference-ffi.cpp
+++ b/tools/omnivoice/src/eliza-inference-ffi.cpp
@@ -14,6 +14,7 @@
// resolve `eliza_inference_*` symbols from this object.
#include "eliza-inference-ffi.h"
+#include "llm-backend.h"
#include "omnivoice.h"
#include "llama.h"
#include "mtmd.h"
@@ -173,6 +174,13 @@ struct EliInferenceContext {
#endif
};
+/* M3 seam accessor (declared in llm-backend.h), defined here where the type is
+ * complete: gives a backend's open() the bundle root, or nullptr for no ctx. */
+const char * llm_backend_context_bundle_dir(const EliInferenceContext * ctx) {
+    if (!ctx) return nullptr;
+    return ctx->bundle_dir.c_str();
+}
+
/* ELZ2 magic 'ELZ1' (the ascii bytes 'E','L','Z','1' little-endian).
* The magic stays 'ELZ1' across format versions — only the version
* word at offset 4 changes between v1 and v2. */
@@ -1135,6 +1143,11 @@ static void reset_engine(Engine * e) {
struct EliLlmStream {
EliInferenceContext * ctx = nullptr;
+ /* Multi-backend seam (M3): when non-NULL, this session is driven by an
+ * alternate in-process runtime (LiteRT-LM / MLX-CoreML) and the llama.cpp
+ * fields below (lctx/sampler/mtp) are unused — every FFI streaming entry
+ * delegates to `backend` and returns before touching the llama.cpp path. */
+ LlmBackendSession * backend = nullptr;
llama_context * lctx = nullptr;
llama_sampler * sampler = nullptr;
int n_past = 0;
@@ -2887,6 +2900,40 @@ EliLlmStream * eliza_inference_llm_stream_open(
return nullptr;
}
+ /* Multi-backend seam (M3): an alternate in-process runtime (LiteRT-LM /
+ * MLX-CoreML) may serve this bundle. The selector returns nullptr with NO
+ * error to keep the in-tree llama.cpp path below; nullptr WITH an error is a
+ * hard env-select failure to propagate. */
+ {
+ char * sel_err = nullptr;
+ LlmBackendFactory * factory =
+ llm_backend_select(ctx->bundle_dir.c_str(), cfg, &sel_err);
+ if (!factory && sel_err) {
+ if (out_error) {
+ *out_error = sel_err;
+ } else {
+ eliza_inference_free_string(sel_err);
+ }
+ return nullptr;
+ }
+ if (factory) {
+ EliLlmStream * bstream = new (std::nothrow) EliLlmStream();
+ if (!bstream) {
+ eliza_set_error(out_error,
+ "[libelizainference] llm_stream_open: out of memory");
+ return nullptr;
+ }
+ bstream->ctx = ctx;
+ bstream->max_tokens = cfg->max_tokens > 0 ? cfg->max_tokens : 0;
+ bstream->backend = factory->open(ctx, cfg, out_error);
+ if (!bstream->backend) {
+ delete bstream;
+ return nullptr;
+ }
+ return bstream;
+ }
+ }
+
llama_model * model = nullptr;
{
std::lock_guard lock(ctx->llm_mutex);
@@ -2988,6 +3035,9 @@ int eliza_inference_llm_stream_prefill(
const int32_t * token_ids,
size_t num_tokens,
char ** out_error) {
+ if (stream && stream->backend) {
+ return stream->backend->prefill(token_ids, num_tokens, out_error);
+ }
if (!stream || (!stream->lctx && !stream->mtp)) {
eliza_set_error(out_error,
"[libelizainference] llm_stream_prefill: invalid session");
@@ -3056,6 +3106,11 @@ int eliza_inference_llm_stream_next(
if (drafter_accepted_out) *drafter_accepted_out = 0;
if (text_out && text_cap > 0) text_out[0] = '\0';
+ if (stream && stream->backend) {
+ return stream->backend->next(tokens_out, tokens_cap, num_tokens_out,
+ text_out, text_cap, drafter_drafted_out,
+ drafter_accepted_out, out_error);
+ }
if (!stream || (!stream->mtp && (!stream->lctx || !stream->sampler))) {
eliza_set_error(out_error,
"[libelizainference] llm_stream_next: invalid session");
@@ -3245,6 +3300,9 @@ int eliza_inference_llm_stream_next(
}
int eliza_inference_llm_stream_cancel(EliLlmStream * stream) {
+ if (stream && stream->backend) {
+ return stream->backend->cancel();
+ }
if (stream) {
stream->cancel.store(true, std::memory_order_release);
}
@@ -3255,6 +3313,9 @@ int eliza_inference_llm_stream_save_slot(
EliLlmStream * stream,
const char * filename,
char ** out_error) {
+ if (stream && stream->backend) {
+ return stream->backend->save_slot(filename, out_error);
+ }
(void) stream;
(void) filename;
/* v1: cross-launch slot KV persistence is not wired. Return a structured
@@ -3269,6 +3330,9 @@ int eliza_inference_llm_stream_restore_slot(
EliLlmStream * stream,
const char * filename,
char ** out_error) {
+ if (stream && stream->backend) {
+ return stream->backend->restore_slot(filename, out_error);
+ }
(void) stream;
(void) filename;
eliza_set_error(out_error,
@@ -3285,6 +3349,7 @@ int eliza_inference_llm_stream_reset(EliLlmStream * stream) {
* created/destroyed repeatedly. Handles both the plain fixed-KV stream and
* the MTP speculative engine (which owns its own target/draft KV). */
if (!stream) return ELIZA_ERR_INVALID_ARG;
+ if (stream->backend) return stream->backend->reset();
if (!stream->mtp && !stream->lctx) return ELIZA_ERR_INVALID_ARG;
if (stream->mtp) {
/* MTP stream: clear both the target and draft KV caches, reset the
@@ -3319,6 +3384,7 @@ int eliza_inference_llm_stream_reset_keep(EliLlmStream * stream, int32_t n_keep)
* separate (riskier) handling — prefix-reuse mode opens the resident stream
* without MTP, trading MTP's ~1.5x decode for the much larger prefill cut. */
if (!stream) return ELIZA_ERR_INVALID_ARG;
+ if (stream->backend) return stream->backend->reset_keep(n_keep);
if (stream->mtp || !stream->lctx) return ELIZA_ERR_INVALID_ARG;
if (n_keep < 0) n_keep = 0;
if (n_keep > stream->n_past) n_keep = stream->n_past;
@@ -3339,6 +3405,10 @@ int eliza_inference_llm_stream_reset_keep(EliLlmStream * stream, int32_t n_keep)
void eliza_inference_llm_stream_close(EliLlmStream * stream) {
if (!stream) return;
+ if (stream->backend) {
+ delete stream->backend;
+ stream->backend = nullptr;
+ }
if (stream->mtp) {
eliza_mtp::free_engine(stream->mtp);
stream->mtp = nullptr;
diff --git a/tools/omnivoice/src/llm-backend-selector.cpp b/tools/omnivoice/src/llm-backend-selector.cpp
new file mode 100644
index 000000000..fa5fa703c
--- /dev/null
+++ b/tools/omnivoice/src/llm-backend-selector.cpp
@@ -0,0 +1,140 @@
+/*
+ * llm-backend-selector.cpp — registry + selection for the multi-runtime
+ * streaming-LLM seam (cutover plan M3).
+ *
+ * On a default build (no -DELIZA_ENABLE_* gate) NO alternate backend is
+ * registered, so llm_backend_select() always returns nullptr and the FFI keeps
+ * the in-tree llama.cpp path. The seam is therefore inert-by-default: the
+ * library behaves exactly as before until an accelerator backend is compiled in.
+ */
+
+#include "llm-backend.h"
+
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <string>
+#include <vector>
+
+/* Gated backend factory accessors. Declared only when the matching backend is
+ * compiled in; llm_backend_register_builtins() calls them under the same gate.
+ * Keeping the declarations gated means the default build has no unresolved symbols. */
+#ifdef ELIZA_ENABLE_LITERT
+LlmBackendFactory * litert_backend_factory();
+#endif
+#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
+LlmBackendFactory * mlx_coreml_backend_factory();
+#endif
+
+namespace {
+
+std::mutex g_reg_mutex;                        /* held around every read/write of g_factories */
+std::vector<LlmBackendFactory *> g_factories;  /* registered factories, in registration order */
+std::once_flag g_builtins_once;                /* makes llm_backend_register_builtins() one-shot */
+
+/* Copies `msg` (terminator included) into a malloc()ed buffer so the caller can
+ * release it with eliza_inference_free_string(); yields nullptr if malloc fails. */
+char * dup_error(const std::string & msg) {
+    const std::size_t nbytes = msg.size() + 1;
+    char * copy = static_cast<char *>(std::malloc(nbytes));
+    return copy ? static_cast<char *>(std::memcpy(copy, msg.c_str(), nbytes)) : nullptr;
+}
+
+/* Case-insensitive C-string equality; a null argument never compares equal. */
+bool iequals(const char * a, const char * b) {
+    if (a == nullptr || b == nullptr) return false;
+    const unsigned char * lhs = reinterpret_cast<const unsigned char *>(a);
+    const unsigned char * rhs = reinterpret_cast<const unsigned char *>(b);
+    for (; *lhs != '\0' && *rhs != '\0'; ++lhs, ++rhs) {
+        if (std::tolower(*lhs) != std::tolower(*rhs)) return false;
+    }
+    /* Equal only when both strings ran out together (both sit on '\0'). */
+    return *lhs == *rhs;
+}
+
+bool is_llamacpp_name(const char * s) { /* any accepted alias of the in-tree llama.cpp path */
+    return iequals(s, "llama") || iequals(s, "llamacpp") || iequals(s, "llama.cpp");
+}
+
+} // namespace
+
+/* Appends `factory` to the registry; null or an already-registered name is a no-op. */
+void llm_backend_register(LlmBackendFactory * factory) {
+    if (factory == nullptr) return;
+    std::lock_guard lock(g_reg_mutex);
+    bool known = false; /* idempotent by case-insensitive name */
+    for (LlmBackendFactory * seen : g_factories) known = known || iequals(seen->name(), factory->name());
+    if (!known) g_factories.push_back(factory);
+}
+
+void llm_backend_register_builtins() { /* one-shot: later calls are no-ops via g_builtins_once */
+ std::call_once(g_builtins_once, []() { /* registers nothing when no ELIZA_ENABLE_* gate is defined */
+#ifdef ELIZA_ENABLE_LITERT
+ llm_backend_register(litert_backend_factory());
+#endif
+#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
+ llm_backend_register(mlx_coreml_backend_factory());
+#endif
+ });
+}
+
+LlmBackendFactory * llm_backend_select(const char * bundle_dir,
+                                       const eliza_llm_stream_config_t * /*cfg*/,
+                                       char ** out_error) {
+    /* Resolves which registered backend, if any, should serve this stream. A
+     * nullptr result means "stay on the in-tree llama.cpp path" unless a forced
+     * selection failed, which is reported through *out_error. */
+    llm_backend_register_builtins();
+
+    const char * const env_name = std::getenv("ELIZA_LLM_BACKEND");
+    const bool has_override = env_name != nullptr && env_name[0] != '\0';
+
+    /* An override naming llama.cpp forces the in-tree path and is not an
+     * error; it is answered before the registry lock is taken. */
+    if (has_override && is_llamacpp_name(env_name)) return nullptr;
+
+    /* Both remaining paths walk g_factories, so lock once for either. */
+    std::lock_guard lock(g_reg_mutex);
+
+    if (has_override) {
+        /* (1) ELIZA_LLM_BACKEND env: a HARD select. Every failure reports
+         * through out_error (when provided) and returns nullptr. */
+        const auto reject = [&](const char * why, const char * tail) -> LlmBackendFactory * {
+            if (out_error) {
+                *out_error = dup_error(std::string("[libelizainference] ELIZA_LLM_BACKEND=") +
+                                       env_name + why + tail);
+            }
+            return nullptr;
+        };
+
+        for (LlmBackendFactory * candidate : g_factories) {
+            if (!iequals(candidate->name(), env_name)) {
+                continue;
+            }
+            if (!candidate->available()) {
+                return reject(" is not available in this build/host", "");
+            }
+            if (!candidate->can_serve(bundle_dir)) {
+                return reject(" cannot serve the bundle at ",
+                              bundle_dir ? bundle_dir : "(null)");
+            }
+            return candidate;
+        }
+        return reject(" is not a registered backend", "");
+    }
+
+    /* (2) Auto-select: the in-tree llama.cpp path is the implicit rank-0
+     * fallback, so only a strictly positive preference_rank() from a backend
+     * that is available AND can serve this bundle wins; ties keep the first. */
+    LlmBackendFactory * winner = nullptr;
+    int winner_rank = 0;
+    for (LlmBackendFactory * candidate : g_factories) {
+        if (!candidate->available() || !candidate->can_serve(bundle_dir)) {
+            continue;
+        }
+        const int rank = candidate->preference_rank();
+        if (rank > winner_rank) { winner_rank = rank; winner = candidate; }
+    }
+    return winner; /* nullptr => in-tree llama.cpp */
+}
diff --git a/tools/omnivoice/src/llm-backend.h b/tools/omnivoice/src/llm-backend.h
new file mode 100644
index 000000000..0fad67f3c
--- /dev/null
+++ b/tools/omnivoice/src/llm-backend.h
@@ -0,0 +1,167 @@
+#pragma once
+/*
+ * llm-backend.h — multi-runtime streaming-LLM backend seam (cutover plan M3).
+ *
+ * The libelizainference streaming-LLM FFI (`eliza_inference_llm_stream_*`) is
+ * ONE pipe that can be driven by more than one in-process inference runtime:
+ *
+ * - llama.cpp — the default / reference backend (CPU / CUDA / Vulkan-Mali-
+ * Adreno / Metal). Always present; the in-tree code path.
+ * - LiteRT-LM — Android NPU (Tensor / Qualcomm QNN / MediaTek NeuroPilot),
+ * optionally desktop/iOS GPU. Gated -DELIZA_ENABLE_LITERT.
+ * - CoreML/MLX — Apple Silicon (mac first, iOS later). Gated -DELIZA_ENABLE_MLX.
+ *
+ * Per native/AGENTS.md §11 (reinterpreted by the Gemma-4 cutover): "one managed
+ * library, one pipe, no sidecar/subprocess/TCP." LiteRT-LM and MLX are
+ * EMBEDDABLE in-process C++ libraries linked INTO libelizainference and exposed
+ * behind the SAME FFI streaming symbols — never a child process or TCP server.
+ * (AICore / Apple Foundation stay opportunistic out-of-process adapters on the
+ * TS side, not owned backends — they are NOT registered here.)
+ *
+ * A backend supplies:
+ * - LlmBackendSession — the per-generation streaming session, mirroring the
+ * FFI pull contract (prefill -> next* -> close) 1:1 so
+ * the FFI functions delegate without translation.
+ * - LlmBackendFactory — names the runtime, reports availability + bundle fit,
+ * and opens sessions.
+ *
+ * `llm_backend_select()` picks a backend at `_open` time from the platform, the
+ * bundle contents, the build flags, and the `ELIZA_LLM_BACKEND` env override.
+ * When it returns nullptr (and no error) the FFI keeps the in-tree llama.cpp
+ * path — so a build with no alternate backend behaves exactly as before.
+ */
+
+#include "eliza-inference-ffi.h" /* eliza_llm_stream_config_t, EliInferenceContext fwd */
+
+#include <cstddef>
+#include <cstdint>
+
+/* Defined in the FFI translation unit. Opaque to backends — a backend reaches
+ * the resident model/bundle through the accessors below, not the struct. */
+struct EliInferenceContext;
+
+/* The bundle directory the context was opened against. A backend's open()
+ * resolves its own artifact under this root (e.g. `<bundle_dir>/text/*.litertlm`,
+ * `<bundle_dir>/text/*.mlpackage`) — the ONLY supported way to read the bundle path,
+ * since the struct itself is opaque here. Returns nullptr when ctx is null.
+ * Defined in eliza-inference-ffi.cpp; the pointer is owned by the context and
+ * stays valid for the session's lifetime. */
+const char * llm_backend_context_bundle_dir(const EliInferenceContext * ctx);
+
+/* ---- Per-generation streaming session ------------------------------------ *
+ *
+ * Lifetime: created by LlmBackendFactory::open(), destroyed via `delete` on the
+ * FFI `_close` path. Every method mirrors the matching FFI entry point so the
+ * FFI can `return session->method(...)` with no argument translation. Status
+ * conventions are identical to the FFI: >= 0 on success, the negative `ELIZA_*`
+ * constants on failure, with `*out_error` heap-allocated for the caller to free.
+ */
+struct LlmBackendSession { /* abstract interface: one virtual per FFI stream entry point */
+ virtual ~LlmBackendSession() = default;
+
+ /* Mirrors eliza_inference_llm_stream_prefill. Copies the tokens it needs. */
+ virtual int prefill(const int32_t * token_ids, size_t num_tokens,
+ char ** out_error) = 0;
+
+ /* Mirrors eliza_inference_llm_stream_next. Returns 0 (more output), 1 (final
+ * step — EOS / cap), or a negative ELIZA_* code (ELIZA_ERR_CANCELLED on
+ * cancel). `drafter_*_out` carry per-step speculative stats (0 when the
+ * backend has no drafter). */
+ virtual int next(int32_t * tokens_out, size_t tokens_cap,
+ size_t * num_tokens_out, char * text_out, size_t text_cap,
+ int32_t * drafter_drafted_out, int32_t * drafter_accepted_out,
+ char ** out_error) = 0;
+
+ /* Mirrors eliza_inference_llm_stream_cancel. Publishes a flag an in-flight
+ * next() checks at a step boundary; safe to call from another thread.
+ * Returns ELIZA_OK whether or not a pass was running. */
+ virtual int cancel() = 0;
+
+ /* Mirrors eliza_inference_llm_stream_reset: clear KV + sampler/counters so
+ * the next prefill starts a fresh prompt on the same warm session. */
+ virtual int reset() = 0;
+
+ /* Mirrors eliza_inference_llm_stream_reset_keep: keep the first `n_keep`
+ * tokens of state resident and drop the rest. Returns the n_keep actually
+ * applied (>= 0, may be clamped / 0 on a full-reset fallback). A backend
+ * that cannot do prefix reuse MUST fall back to a full reset and return 0 —
+ * never an error. */
+ virtual int reset_keep(int32_t n_keep) = 0;
+
+ /* Slot KV persistence — optional. The defaults return ELIZA_ERR_INVALID_ARG and leave *out_error untouched. */
+ virtual int save_slot(const char * /*filename*/, char ** /*out_error*/) {
+ return ELIZA_ERR_INVALID_ARG;
+ }
+ virtual int restore_slot(const char * /*filename*/, char ** /*out_error*/) {
+ return ELIZA_ERR_INVALID_ARG;
+ }
+};
+
+/* ---- Backend factory (one per linked-in runtime) ------------------------- */
+struct LlmBackendFactory { /* abstract interface: describes one runtime and opens its sessions */
+ virtual ~LlmBackendFactory() = default;
+
+ /* Stable lower-case id: "llama.cpp", "litert-lm", "mlx-coreml". Matched
+ * case-insensitively against ELIZA_LLM_BACKEND and by llm_backend_register(). */
+ virtual const char * name() const = 0;
+
+ /* True only when this backend is compiled in AND its runtime dependencies
+ * are present on THIS host (the NPU delegate / Metal device / the linked
+ * lib). A scaffold whose build gate is OFF returns false. Cheap — must not
+ * load a model. */
+ virtual bool available() const = 0;
+
+ /* True when this backend can serve the bundle at `bundle_dir` — i.e. the
+ * backend-specific artifact exists (e.g. `text/*.litertlm`, `text/*.mlpackage`).
+ * Cheap directory probe, no model load. */
+ virtual bool can_serve(const char * bundle_dir) const = 0;
+
+ /* Platform-affinity rank used to order candidates when several can serve the
+ * same bundle and no env override is set. Higher wins; ties keep the first
+ * registered. The in-tree llama.cpp path is the implicit rank 0, so the
+ * default of 0 (or any rank <= 0) is never auto-selected. */
+ virtual int preference_rank() const { return 0; }
+
+ /* Create a streaming session for (ctx, cfg). Returns nullptr + `*out_error`
+ * on failure. The returned session is owned by the caller (FFI `_close`
+ * deletes it). */
+ virtual LlmBackendSession * open(EliInferenceContext * ctx,
+ const eliza_llm_stream_config_t * cfg,
+ char ** out_error) = 0;
+};
+
+/* ---- Registry + selection ------------------------------------------------ *
+ *
+ * Backends register their singleton factory (idempotent; the registry does not
+ * take ownership — factories are static-lifetime singletons). The FFI
+ * translation unit calls llm_backend_register_builtins() once to register every
+ * compiled-in backend, then calls llm_backend_select() per `_open`.
+ */
+
+/* Register a factory (null ignored; idempotent by case-insensitive name). Safe to call from static init. */
+void llm_backend_register(LlmBackendFactory * factory);
+
+/* Register every backend compiled into THIS build (gated by the -DELIZA_ENABLE_*
+ * CMake options). Idempotent (std::call_once); llm_backend_select() invokes it on
+ * every call. Defined in llm-backend-selector.cpp, which calls each gated factory accessor. */
+void llm_backend_register_builtins();
+
+/* Pick a backend for the bundle at `bundle_dir` (`cfg` is currently unused). Resolution order:
+ *
+ * 1. ELIZA_LLM_BACKEND env (exact, case-insensitive backend name) — a HARD
+ * select. "llama.cpp" / "llamacpp" / "llama" forces the in-tree path (returns
+ * nullptr, no error). Any other name that is not registered+available, or
+ * cannot serve the bundle, is a hard error: returns nullptr AND sets
+ * `*out_error` so the FFI aborts rather than silently using llama.cpp.
+ *
+ * 2. No env override: among registered backends that are available() AND
+ * can_serve(bundle_dir), pick the highest positive preference_rank(). If none
+ * qualifies, return nullptr (use the in-tree llama.cpp path).
+ *
+ * A nullptr return with `*out_error == nullptr` means "use the in-tree llama.cpp
+ * path" — NOT an error. A nullptr return with `*out_error != nullptr` is a hard
+ * failure the caller must propagate.
+ */
+LlmBackendFactory * llm_backend_select(const char * bundle_dir,
+ const eliza_llm_stream_config_t * cfg,
+ char ** out_error);