diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index cd9666a21..0eb9dad8a 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1990,7 +1990,19 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla
         uint32_t cell_range_begin = cells.size();
 
         for (uint32_t i = 0; i < cells.size(); ++i) {
-            if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
+            bool add_cell = true;
+
+            add_cell = add_cell && !cells.is_empty(i);
+            add_cell = add_cell && (seq_id == -1 || cells.seq_has(i, seq_id));
+
+            // check the cell is not SWA-masked
+            if (add_cell && seq_id != -1) {
+                const bool is_masked = llama_hparams::is_masked_swa(n_swa, swa_type, cells.pos_get(i), cells.seq_pos_max(seq_id));
+
+                add_cell = !is_masked;
+            }
+
+            if (add_cell) {
                 ++cell_count;
                 if (cell_range_begin == cells.size()) {
                     cell_range_begin = i;
@@ -2246,7 +2258,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
 
         sinfo = find_slot(ubatch, false);
         if (sinfo.empty()) {
-            LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+            LLAMA_LOG_ERROR("%s: failed to find %d available cells in kv cache\n", __func__,  cell_count);
             return false;
         }
 
diff --git a/tools/omnivoice/CMakeLists.txt b/tools/omnivoice/CMakeLists.txt
index 72088dfa0..6cb3e13a6 100644
--- a/tools/omnivoice/CMakeLists.txt
+++ b/tools/omnivoice/CMakeLists.txt
@@ -78,6 +78,11 @@ set(OMNIVOICE_CORE_SOURCES
 # llama + mtmd into a single ABI-stable C surface.
 set(OMNIVOICE_FFI_SOURCES
     src/eliza-inference-ffi.cpp
+    # Multi-runtime streaming-LLM backend seam (cutover plan M3). The selector
+    # is always compiled; it is inert until a -DELIZA_ENABLE_* accelerator
+    # backend below registers itself, so the default build keeps the in-tree
+    # llama.cpp path.
+    src/llm-backend-selector.cpp
 )
 
 # Vendored standalone voice-classifier forward graphs (pure scalar C, no
@@ -220,6 +225,19 @@ endif()
 # (the fused build links mtmd anyway), opt out with -DELIZA_ENABLE_VISION=OFF.
 option(ELIZA_ENABLE_VISION "Build the fused mmproj vision-describe ABI (v9)" ON)
 
+# ELIZA_ENABLE_LITERT — compile the LiteRT-LM in-process streaming-LLM backend
+# (cutover plan M4 — Android NPU: Tensor / Qualcomm QNN / MediaTek NeuroPilot).
+# OFF by default: the selector registers no LiteRT backend and the streaming-LLM
+# pipe keeps the in-tree llama.cpp path. ON requires the LiteRT-LM SDK
+# (ELIZA_LITERT_SDK_DIR) — a host/device cross-build concern, not the Linux CI
+# default. See docs/multi-backend-ffi-seam.md.
+option(ELIZA_ENABLE_LITERT "Build the LiteRT-LM in-process LLM backend (M4)" OFF)
+
+# ELIZA_ENABLE_MLX — compile the CoreML/MLX in-process streaming-LLM backend
+# (cutover plan M5 — Apple Silicon). OFF by default; ON is Apple-only and
+# requires the MLX / CoreML toolchain. See docs/multi-backend-ffi-seam.md.
+option(ELIZA_ENABLE_MLX "Build the CoreML/MLX in-process LLM backend (M5)" OFF)
+
 if(TARGET mtmd)
     add_library(elizainference SHARED
         ${OMNIVOICE_CORE_SOURCES}
@@ -271,6 +289,48 @@ if(TARGET mtmd)
             ${CMAKE_CURRENT_SOURCE_DIR}/../kokoro/include)
         target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_KOKORO)
     endif()
+    # ── Multi-runtime streaming-LLM accelerator backends (cutover M4/M5) ────
+    # The M3 selector (src/llm-backend-selector.cpp) is always compiled in via
+    # OMNIVOICE_FFI_SOURCES. These accelerator backends each link an external
+    # SDK, so they are opt-in. When a gate is OFF its source is not compiled,
+    # the selector's `#ifdef`-guarded factory declaration + registration drop
+    # out, and the streaming-LLM pipe keeps the in-tree llama.cpp path — so the
+    # default desktop/CI build is byte-for-byte the pre-seam behavior.
+    if(ELIZA_ENABLE_LITERT)
+        target_sources(elizainference PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp)
+        target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT)
+        # LiteRT-LM SDK (github.com/google-ai-edge/LiteRT-LM). Point at a built
+        # SDK with -DELIZA_LITERT_SDK_DIR=<dir>; the device/host cross-build
+        # links its libs + the NPU delegates with -DELIZA_LITERT_LIBS=<libs>.
+        if(ELIZA_LITERT_SDK_DIR)
+            target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/include)
+            target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/lib)
+        endif()
+        if(ELIZA_LITERT_LIBS)
+            target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LIBS})
+        endif()
+    endif()
+    if(ELIZA_ENABLE_MLX)
+        if(NOT APPLE)
+            message(FATAL_ERROR
+                "ELIZA_ENABLE_MLX requires an Apple host (CoreML/MLX are Apple-only).")
+        endif()
+        target_sources(elizainference PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/mlx-coreml-backend.mm)
+        target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_MLX)
+        # MLX C API (ml-explore/mlx-c) via -DELIZA_MLX_C_DIR / -DELIZA_MLX_LIBS,
+        # plus the system CoreML / Metal / Foundation frameworks.
+        if(ELIZA_MLX_C_DIR)
+            target_include_directories(elizainference PRIVATE ${ELIZA_MLX_C_DIR}/include)
+            target_link_directories(elizainference PRIVATE ${ELIZA_MLX_C_DIR}/lib)
+        endif()
+        if(ELIZA_MLX_LIBS)
+            target_link_libraries(elizainference PRIVATE ${ELIZA_MLX_LIBS})
+        endif()
+        target_link_libraries(elizainference PRIVATE
+            "-framework Foundation" "-framework CoreML" "-framework Metal")
+    endif()
     set_target_properties(elizainference PROPERTIES
         OUTPUT_NAME              elizainference
         POSITION_INDEPENDENT_CODE ON)
diff --git a/tools/omnivoice/src/backends/litert-backend.cpp b/tools/omnivoice/src/backends/litert-backend.cpp
new file mode 100644
index 000000000..3b3dad137
--- /dev/null
+++ b/tools/omnivoice/src/backends/litert-backend.cpp
@@ -0,0 +1,471 @@
+/*
+ * litert-backend.cpp — LiteRT-LM in-process streaming-LLM backend (M4).
+ *
+ * See litert-backend.h for the targeted LiteRT-LM C++ API (repo + research
+ * date cited there). The real implementation is gated behind
+ * `ELIZA_ENABLE_LITERT`; compiled without that define, this file builds the
+ * stub branch, which includes zero LiteRT-LM SDK headers and reports
+ * `available() == false` so the selector keeps the in-tree llama.cpp path.
+ *
+ * Error contract (native/AGENTS.md §3 + §9): never log, never return a
+ * defaulted result on failure. Every failure path heap-allocates `*out_error`
+ * via litert_set_error() (matching the FFI cpp's eliza_strdup/eliza_set_error
+ * style) and returns the negative ELIZA_* code or nullptr.
+ */
+
+#include "litert-backend.h"
+
+#include <atomic>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <string>
+
+#if defined(__has_include)
+#  if __has_include(<filesystem>)
+#    include <filesystem>
+#    define LITERT_HAVE_FILESYSTEM 1
+#  endif
+#endif
+
+/* ── Heap-allocated error strings (mirror eliza-inference-ffi.cpp) ───────── */
+namespace {
+
+char * litert_strdup(const std::string & s) {
+    const std::size_t bytes = s.size() + 1;  /* payload + NUL terminator */
+    void * raw = std::malloc(bytes);         /* caller releases with free() */
+    if (raw != nullptr) std::memcpy(raw, s.c_str(), bytes);
+    return static_cast<char *>(raw);         /* nullptr when malloc failed */
+}
+
+void litert_set_error(char ** out_error, const std::string & msg) {
+    /* Callers may pass NULL when they do not want the message text. */
+    if (out_error != nullptr) *out_error = litert_strdup(msg);
+}
+
+#if defined(LITERT_HAVE_FILESYSTEM)
+/* Probe <bundle_dir>/text/ for a *.litertlm artifact (first match wins).
+ * Cheap directory walk, no model load (LlmBackendFactory::can_serve contract). */
+std::string find_litertlm_artifact(const char * bundle_dir) {
+    if (!bundle_dir || bundle_dir[0] == '\0') return std::string();
+    std::error_code ec;
+    const std::filesystem::path text_dir =
+        std::filesystem::path(bundle_dir) / LITERT_BUNDLE_TEXT_SUBDIR;
+    if (!std::filesystem::is_directory(text_dir, ec)) return std::string();
+    for (std::filesystem::directory_iterator it(text_dir, ec), end;
+         !ec && it != end; it.increment(ec)) {
+        /* Own error_code: a bad entry (dangling symlink) must not end the walk. */
+        std::error_code entry_ec;
+        if (!it->is_regular_file(entry_ec)) continue;
+        if (it->path().extension() == LITERT_ARTIFACT_EXT) return it->path().string();
+    }
+    return std::string();
+}
+#else
+std::string find_litertlm_artifact(const char *) { return std::string(); }
+#endif
+
+}  // namespace
+
+/* ════════════════════════════════════════════════════════════════════════ *
+ *  REAL implementation — only when ELIZA_ENABLE_LITERT is defined.
+ *  Behind this gate we may include LiteRT-LM SDK headers; outside it we
+ *  include NONE so the file builds on a host without the SDK.
+ * ════════════════════════════════════════════════════════════════════════ */
+#ifdef ELIZA_ENABLE_LITERT
+
+#include <memory>
+#include <optional>
+#include <utility>
+#include <variant>
+#include <vector>
+
+/* LiteRT-LM cross-platform C++ runtime. Paths per the repo's bazel layout
+ * (github.com/google-ai-edge/LiteRT-LM, `main`, researched 2026-06-22). */
+#include "runtime/engine/engine.h"          // litert::lm::Engine, SessionInterface
+#include "runtime/engine/engine_settings.h" // EngineSettings, SessionConfig, ModelAssets
+#include "runtime/engine/io_types.h"        // InputData, InputText, Responses
+
+namespace {
+
+using litert::lm::Backend;
+using litert::lm::Engine;
+using litert::lm::EngineSettings;
+using litert::lm::InputData;
+using litert::lm::InputText;
+using litert::lm::ModelAssets;
+using litert::lm::Responses;
+using litert::lm::SessionConfig;
+
+/* The Session type the templated Engine hands back (Engine::Session is the
+ * public alias EngineT<SessionT> exposes; for Engine it is SessionInterface). */
+using Session = Engine::Session;
+
+/* The accelerator the factory resolved at open(), recorded for diagnostics
+ * and preference reporting. DEVICE-VERIFY: which rung actually initializes is
+ * hardware-dependent and can only be confirmed on an NPU/GPU device. */
+enum class ResolvedAccelerator { kNone, kNpu, kGpu, kCpu };
+
+/* Lower-case label for a resolved accelerator, used in diagnostics. Any
+ * value other than NPU/GPU/CPU (including kNone) maps to "none". */
+const char * accelerator_name(ResolvedAccelerator a) {
+    if (a == ResolvedAccelerator::kNpu) return "npu";
+    if (a == ResolvedAccelerator::kGpu) return "gpu";
+    if (a == ResolvedAccelerator::kCpu) return "cpu";
+    return "none";
+}
+
+/* Try to build an Engine for `artifact` on `backend`. Returns the Engine on
+ * success; on failure returns nullptr (the ladder falls through to the next
+ * rung). The error text is captured so the final rung can surface it. */
+std::unique_ptr<Engine> try_engine(const std::string & artifact,
+                                   Backend backend,
+                                   std::string & last_err) {
+    /* Any stage that fails records its status text and yields no Engine. */
+    const auto fail = [&last_err](const auto & status) -> std::unique_ptr<Engine> {
+        last_err = std::string(status.message());
+        return nullptr;
+    };
+
+    auto assets = ModelAssets::Create(artifact);
+    if (!assets.ok()) return fail(assets.status());
+
+    auto engine_settings = EngineSettings::CreateDefault(*assets, backend);
+    if (!engine_settings.ok()) return fail(engine_settings.status());
+
+    auto built = Engine::CreateEngine(*engine_settings);
+    if (!built.ok()) return fail(built.status());
+
+    return std::move(*built);
+}
+
+/* ── Session: mirrors the FFI streaming pull contract 1:1 ────────────────── */
+class LiteRtBackendSession final : public LlmBackendSession {
+public:
+    LiteRtBackendSession(std::unique_ptr<Engine> engine,
+                         std::unique_ptr<Session> session,
+                         const eliza_llm_stream_config_t & cfg,
+                         ResolvedAccelerator accel)
+        : engine_(std::move(engine)),
+          session_(std::move(session)),
+          accel_(accel),
+          max_tokens_(cfg.max_tokens > 0 ? cfg.max_tokens : 0) {}
+
+    /* prefill: copy the caller's tokens, detokenize through the engine's
+     * tokenizer, and run a LiteRT prefill pass. The FFI hands pre-tokenized
+     * ids (text-model vocab); LiteRT-LM's prefill consumes InputData (text),
+     * so we round-trip ids → text via the shared tokenizer rather than
+     * assuming vocab parity (the .litertlm graph carries its own tokenizer).
+     * DEVICE-VERIFY: id/text round-trip fidelity needs a real .litertlm. */
+    int prefill(const int32_t * token_ids, size_t num_tokens,
+                char ** out_error) override {
+        if (!session_) {
+            litert_set_error(out_error,
+                "[litert-lm] prefill: session is not open");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (!token_ids && num_tokens > 0) {  /* NULL is only valid when empty */
+            litert_set_error(out_error, "[litert-lm] prefill: token_ids is NULL");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancelled_.load(std::memory_order_acquire)) return ELIZA_ERR_CANCELLED;
+        std::vector<int> ids(token_ids, token_ids + num_tokens);
+
+        const std::string text = engine_->GetTokenizer().Detokenize(ids);
+        std::vector<InputData> contents;
+        contents.emplace_back(InputText(std::string(text)));
+
+        absl::Status st = session_->RunPrefill(contents);
+        if (!st.ok()) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] RunPrefill failed: ") +
+                std::string(st.message()));
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        prefilled_ = true;
+        return ELIZA_OK;
+    }
+
+    /* next: one decode step. LiteRT-LM's RunDecode() returns a Responses
+     * batch; we emit the newly-produced UTF-8 delta as detokenized text and
+     * its token ids. LiteRT-LM has no in-process MTP drafter exposed through
+     * this surface, so drafted/accepted are always 0. Returns 1 (final) at
+     * EOS or the max-token cap, 0 otherwise. */
+    int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out,
+             char * text_out, size_t text_cap, int32_t * drafter_drafted_out,
+             int32_t * drafter_accepted_out, char ** out_error) override {
+        if (num_tokens_out) *num_tokens_out = 0;
+        if (text_out && text_cap) text_out[0] = '\0';
+        if (drafter_drafted_out)  *drafter_drafted_out = 0;
+        if (drafter_accepted_out) *drafter_accepted_out = 0;
+
+        if (!session_) {
+            litert_set_error(out_error, "[litert-lm] next: session not open");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (!prefilled_) {
+            litert_set_error(out_error,
+                "[litert-lm] next: prefill must run before next");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancelled_.load(std::memory_order_acquire)) return ELIZA_ERR_CANCELLED;
+
+        auto responses = session_->RunDecode();
+        if (!responses.ok()) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] RunDecode failed: ") +
+                std::string(responses.status().message()));
+            return ELIZA_ERR_FFI_FAULT;
+        }
+
+        /* RunDecode yields the running candidate texts; GetTexts()[0] is the
+         * cumulative decode for candidate 0. Emit only the suffix produced
+         * since the last step so the FFI streams a delta per pull. */
+        const std::vector<std::string> & texts = responses->GetTexts();
+        const std::string cumulative = texts.empty() ? std::string() : texts.front();
+        std::string delta = compute_delta(cumulative);
+        const size_t pending = delta.size();
+
+        /* Hand out only what fits in text_out (cut on a UTF-8 boundary); the
+         * rest stays un-emitted so the next pull re-delivers it instead of
+         * losing it. If no whole code point fits, text is skipped (ids only). */
+        size_t fits = pending;
+        if (text_out && text_cap && fits > text_cap - 1) {
+            fits = text_cap - 1;
+            while (fits > 0 && (delta[fits] & 0xC0) == 0x80) --fits;
+        }
+        if (fits > 0) delta.resize(fits);
+        emitted_chars_ = cumulative.size() - (pending - delta.size());
+        if (text_out && text_cap) {
+            std::memcpy(text_out, delta.data(), fits);
+            text_out[fits] = '\0';
+        }
+
+        /* Re-tokenize the delta against the engine tokenizer so the FFI gets
+         * committed text-vocab ids (the same round-trip the prefill used). */
+        std::vector<int> delta_ids = engine_->GetTokenizer().Tokenize(delta);
+        size_t n_emit = delta_ids.size();
+        if (n_emit > tokens_cap) n_emit = tokens_cap;
+        for (size_t i = 0; tokens_out && i < n_emit; ++i) tokens_out[i] = delta_ids[i];
+        if (num_tokens_out) *num_tokens_out = n_emit;
+        decoded_tokens_ += static_cast<int32_t>(delta_ids.size());
+
+        const bool hit_cap = max_tokens_ > 0 && decoded_tokens_ >= max_tokens_;
+        /* DEVICE-VERIFY: the precise EOS signal LiteRT-LM exposes per step is
+         * runtime-version-dependent. A done decode yields no new delta; treat
+         * an empty delta (nothing held back) or the token cap as final. */
+        const bool eos = delta_ids.empty() && delta.size() == pending;
+        return (hit_cap || eos) ? 1 : 0;
+    }
+
+    /* cancel: publish a flag the next decode step observes. Thread-safe. */
+    int cancel() override {
+        cancelled_.store(true, std::memory_order_release);
+        return ELIZA_OK;
+    }
+
+    /* reset: build a fresh Session from the same Engine (clears KV + sampler).
+     * Reuses the warm Engine (model weights stay resident) — only the
+     * per-generation Session is rebuilt. */
+    int reset() override {
+        auto session = engine_->CreateSession(SessionConfig::CreateDefault());
+        if (!session.ok()) {
+            /* reset has no out_error param; a failed rebuild leaves the old
+             * session (and its state) in place and only reports the code. */
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        session_ = std::move(*session);
+        cancelled_.store(false, std::memory_order_release);
+        prefilled_ = false;
+        decoded_tokens_ = 0;
+        emitted_chars_ = 0;
+        return ELIZA_OK;
+    }
+
+    /* reset_keep: LiteRT-LM's Session does not expose prefix-preserving KV
+     * trimming through this surface, so fall back to a full reset and return 0
+     * (no prefix kept) — never an error (llm-backend.h contract). */
+    int reset_keep(int32_t /*n_keep*/) override {
+        reset();
+        return 0;
+    }
+
+    const char * accelerator() const { return accelerator_name(accel_); }
+
+private:
+    /* The suffix of `cumulative` produced since the last emitted step. */
+    std::string compute_delta(const std::string & cumulative) const {
+        if (cumulative.size() <= emitted_chars_) return std::string();
+        return cumulative.substr(emitted_chars_);
+    }
+
+    std::unique_ptr<Engine>  engine_;
+    std::unique_ptr<Session> session_;
+    std::atomic<bool>        cancelled_{false};
+    bool                     prefilled_ = false;
+    int32_t                  decoded_tokens_ = 0;
+    size_t                   emitted_chars_ = 0;
+    ResolvedAccelerator      accel_ = ResolvedAccelerator::kNone;
+    int32_t                  max_tokens_ = 0;
+};
+
+/* ── Factory ─────────────────────────────────────────────────────────────── */
+class LiteRtBackendFactory final : public LlmBackendFactory {
+public:
+    const char * name() const override { return LITERT_BACKEND_NAME; }
+
+    /* available(): compiled in AND an accelerator (NPU or GPU) initializes on
+     * THIS host. Cheap — must not load a model. We probe by building a minimal
+     * EngineSettings on NPU then GPU with NO model assets; a backend whose
+     * delegate is missing fails settings validation. CPU alone does NOT make
+     * this backend "available" (CPU is the in-tree llama.cpp path's job).
+     * DEVICE-VERIFY: real delegate presence is only knowable on-device. */
+    bool available() const override {
+        return probe_accelerator() != ResolvedAccelerator::kNone;
+    }
+
+    /* can_serve(): a *.litertlm exists under <bundle_dir>/text/. Cheap probe,
+     * no caching — open() re-resolves the bundle from the context accessor. */
+    bool can_serve(const char * bundle_dir) const override {
+        return !find_litertlm_artifact(bundle_dir).empty();
+    }
+
+    /* preference_rank(): high on Android NPU (the whole reason this backend
+     * exists), modest on a GPU-only fallback, 0 otherwise so llama.cpp wins. */
+    int preference_rank() const override {
+        switch (probe_accelerator()) {
+            case ResolvedAccelerator::kNpu: return 100;
+            case ResolvedAccelerator::kGpu: return 20;
+            default:                        return 0;
+        }
+    }
+
+    /* open(): resolve the .litertlm under the cached bundle, then walk the
+     * accelerator ladder NPU → GPU → CPU, recording which rung built the
+     * Engine. Builds a default Session and returns the streaming session. */
+    LlmBackendSession * open(EliInferenceContext * ctx,
+                             const eliza_llm_stream_config_t * cfg,
+                             char ** out_error) override {
+        if (!cfg) {
+            litert_set_error(out_error, "[litert-lm] open: cfg is NULL");
+            return nullptr;
+        }
+        const char * bundle_dir = llm_backend_context_bundle_dir(ctx);
+        const std::string bundle = bundle_dir ? bundle_dir : std::string();
+        std::string artifact = find_litertlm_artifact(bundle.c_str());
+        if (artifact.empty()) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] open: no ") + LITERT_ARTIFACT_EXT +
+                " artifact under " + bundle + "/" + LITERT_BUNDLE_TEXT_SUBDIR);
+            return nullptr;
+        }
+
+        /* Accelerator ladder — NPU first (Qualcomm QNN / MediaTek NeuroPilot /
+         * Google Tensor), then GPU (OpenCL/Metal/WebGPU), then CPU (XNNPACK).
+         * Every failed rung's error text is collected for the final diagnostic.
+         * DEVICE-VERIFY: rung availability is hardware-specific. */
+        struct Rung { Backend backend; ResolvedAccelerator accel; };
+        const Rung ladder[] = {
+            {Backend::NPU, ResolvedAccelerator::kNpu},
+            {Backend::GPU, ResolvedAccelerator::kGpu},
+            {Backend::CPU, ResolvedAccelerator::kCpu},
+        };
+
+        std::unique_ptr<Engine> engine;
+        ResolvedAccelerator resolved = ResolvedAccelerator::kNone;
+        std::string rung_errors;  /* "npu: <err>; gpu: <err>; cpu: <err>" */
+        for (const Rung & rung : ladder) {
+            std::string err;
+            engine = try_engine(artifact, rung.backend, err);
+            if (engine) { resolved = rung.accel; break; }
+            if (!rung_errors.empty()) rung_errors += "; ";
+            rung_errors += std::string(accelerator_name(rung.accel)) + ": " + err;
+        }
+        if (!engine) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] open: no accelerator could build the "
+                            "engine (") + rung_errors + ")");
+            return nullptr;
+        }
+
+        auto session_cfg = SessionConfig::CreateDefault();
+        auto session = engine->CreateSession(session_cfg);
+        if (!session.ok()) {
+            litert_set_error(out_error,
+                std::string("[litert-lm] open: CreateSession failed on ") +
+                accelerator_name(resolved) + ": " +
+                std::string(session.status().message()));
+            return nullptr;
+        }
+
+        return new LiteRtBackendSession(std::move(engine), std::move(*session),
+                                        *cfg, resolved);
+    }
+
+private:
+    /* Build a no-model EngineSettings on NPU then GPU; the first whose
+     * delegate validates marks that rung present. Result is memoized so the
+     * repeated available()/preference_rank() calls are cheap.
+     * DEVICE-VERIFY: settings-only validation is the cheapest honest probe;
+     * the true delegate handshake happens at open() on-device. */
+    ResolvedAccelerator probe_accelerator() const {
+        std::call_once(probe_once_, [this]() {
+            auto empty = ModelAssets::Create(std::string());
+            if (!empty.ok()) { probed_ = ResolvedAccelerator::kNone; return; }
+            if (EngineSettings::CreateDefault(*empty, Backend::NPU).ok()) {
+                probed_ = ResolvedAccelerator::kNpu;
+            } else if (EngineSettings::CreateDefault(*empty, Backend::GPU).ok()) {
+                probed_ = ResolvedAccelerator::kGpu;
+            } else {
+                probed_ = ResolvedAccelerator::kNone;
+            }
+        });
+        return probed_;
+    }
+
+    mutable std::once_flag      probe_once_;
+    mutable ResolvedAccelerator probed_ = ResolvedAccelerator::kNone;
+};
+
+}  // namespace
+
+LlmBackendFactory * litert_backend_factory() {
+    static LiteRtBackendFactory instance;  /* built on first call, never freed */
+    return &instance;
+}
+
+#else  /* ────────────────────────── STUB (no LiteRT-LM SDK) ──────────────── */
+
+/*
+ * Compiled-out stub: zero LiteRT-LM headers, so this builds on any host. The
+ * factory links in as a no-op — available() is false, can_serve() is false,
+ * preference_rank() is 0, and open() returns nullptr + sets `*out_error`
+ * "not compiled in" so the selector cleanly keeps the in-tree llama.cpp path.
+ */
+namespace {
+
+class LiteRtBackendFactoryStub final : public LlmBackendFactory {
+public:
+    const char * name() const override { return LITERT_BACKEND_NAME; }
+    int preference_rank() const override { return 0; }
+    bool can_serve(const char *) const override { return false; }
+    bool available() const override { return false; }
+    /* open() never yields a session; it only reports why through out_error. */
+    LlmBackendSession * open(EliInferenceContext *, const eliza_llm_stream_config_t *,
+                             char ** out_error) override {
+        static const char kNotCompiledIn[] =
+            "[litert-lm] backend not compiled in "
+            "(build with -DELIZA_ENABLE_LITERT to enable the LiteRT-LM NPU path)";
+        litert_set_error(out_error, kNotCompiledIn);
+        return nullptr;
+    }
+};
+
+}  // namespace
+
+LlmBackendFactory * litert_backend_factory() {
+    static LiteRtBackendFactoryStub instance;  /* built on first call, never freed */
+    return &instance;
+}
+
+#endif  /* ELIZA_ENABLE_LITERT */
diff --git a/tools/omnivoice/src/backends/litert-backend.h b/tools/omnivoice/src/backends/litert-backend.h
new file mode 100644
index 000000000..9096b64d0
--- /dev/null
+++ b/tools/omnivoice/src/backends/litert-backend.h
@@ -0,0 +1,73 @@
+#pragma once
+/*
+ * litert-backend.h — LiteRT-LM in-process streaming-LLM backend (cutover plan M4).
+ *
+ * Implements the M3 backend seam (`llm-backend.h`) on top of Google's
+ * LiteRT-LM C++ inference runtime, the in-process path for the Android NPU
+ * tier (Qualcomm QNN / MediaTek NeuroPilot / Google Tensor), with an
+ * optional desktop/iOS GPU fallback. LiteRT-LM is linked INTO
+ * `libelizainference` and exposed behind the same FFI streaming symbols —
+ * never a child process or TCP server (native/AGENTS.md §11, gemma4 cutover).
+ *
+ * The whole real implementation is gated behind the CMake define
+ * `ELIZA_ENABLE_LITERT`. When that flag is OFF this header pulls in NO
+ * LiteRT-LM SDK headers, so the file compiles on a host without the SDK and
+ * the factory links in as a no-op: `available()` is false and `open()`
+ * returns nullptr + sets `*out_error` "not compiled in".
+ *
+ * ── Targeted runtime API (researched 2026-06-22) ──────────────────────────
+ * Repo:    https://github.com/google-ai-edge/LiteRT-LM  (`main`)
+ * Docs:    https://developers.google.com/edge/litert-lm/cpp
+ *          https://ai.google.dev/edge/litert/next/litert_lm_npu
+ * Namespace: `litert::lm`
+ *
+ * Symbols this backend targets (verbatim from the headers above):
+ *   - runtime/engine/engine.h
+ *       using Engine = EngineT<SessionInterface>;
+ *       static absl::StatusOr<std::unique_ptr<Engine>>
+ *           Engine::CreateEngine(const EngineSettings&);
+ *       absl::StatusOr<std::unique_ptr<SessionT>>
+ *           EngineT::CreateSession(const SessionConfig&);
+ *   - runtime/engine/engine.h  (SessionInterface)
+ *       absl::Status        RunPrefill(const std::vector<InputData>&);
+ *       absl::StatusOr<Responses> RunDecode();
+ *       absl::StatusOr<Responses> RunDecode(const DecodeConfig&);
+ *       absl::Status        GenerateContentStream(
+ *                               const std::vector<InputData>&,
+ *                               absl::AnyInvocable<void(absl::StatusOr<Responses>)>);
+ *   - runtime/engine/engine_settings.h
+ *       static absl::StatusOr<EngineSettings> EngineSettings::CreateDefault(
+ *           ModelAssets, Backend backend = Backend::CPU,
+ *           std::optional<Backend> vision_backend  = std::nullopt,
+ *           std::optional<Backend> audio_backend   = std::nullopt,
+ *           std::optional<Backend> sampler_backend = std::nullopt);
+ *       static SessionConfig SessionConfig::CreateDefault();
+ *       absl::StatusOr<ModelAssets> ModelAssets::Create(<path>);   // .litertlm
+ *   - runtime/engine/io_types.h
+ *       using InputData = std::variant<InputText, InputImage, InputAudio, ...>;
+ *       class InputText { explicit InputText(std::variant<std::string, TensorBuffer>); };
+ *       class Responses  { const std::vector<std::string>& GetTexts() const; };
+ *   - runtime/proto/engine.pb.h
+ *       enum Backend { ... CPU, GPU, NPU, ... };   // litert::lm::Backend
+ *
+ * Accelerator ladder (Android NPU first): the factory tries NPU, then GPU,
+ * then CPU at `open()` and records which one initialized. Every
+ * hardware-gated assumption is tagged `DEVICE-VERIFY` in the .cpp — the
+ * accelerator ladder, the .litertlm graph fit, and tok/s can only be
+ * confirmed on a real NPU device, which this scaffold does not have.
+ */
+
+#include "../llm-backend.h"
+
+/* Stable id matched case-insensitively against ELIZA_LLM_BACKEND, and the
+ * subdir + artifact extension the factory probes under <bundle_dir>/text/. */
+#define LITERT_BACKEND_NAME "litert-lm"
+#define LITERT_BUNDLE_TEXT_SUBDIR "text"
+#define LITERT_ARTIFACT_EXT ".litertlm"
+
+/* Singleton factory accessor. The selector (llm-backend-selector.cpp) calls
+ * this from `llm_backend_register_builtins()` to register the backend. The
+ * returned pointer is a static-lifetime singleton the registry does not own.
+ * litert-backend.cpp defines it in both branches (stub when ELIZA_ENABLE_LITERT
+ * is unset), but CMake only compiles that file when the option is ON. */
+LlmBackendFactory * litert_backend_factory();
diff --git a/tools/omnivoice/src/backends/mlx-coreml-backend.h b/tools/omnivoice/src/backends/mlx-coreml-backend.h
new file mode 100644
index 000000000..36d048c00
--- /dev/null
+++ b/tools/omnivoice/src/backends/mlx-coreml-backend.h
@@ -0,0 +1,128 @@
+#pragma once
+/*
+ * mlx-coreml-backend.h — Apple-Silicon in-process streaming-LLM backend
+ * (Gemma-4 cutover plan M5). One of the alternate `LlmBackendSession` /
+ * `LlmBackendFactory` implementations behind the multi-runtime FFI seam
+ * defined in `../llm-backend.h` (cutover plan M3).
+ *
+ * Per native/AGENTS.md §11 ("one managed library, one pipe, no
+ * sidecar/subprocess/TCP") this backend is COMPILED INTO libelizainference
+ * and exposes the SAME `eliza_inference_llm_stream_*` FFI pull contract —
+ * it is the owned backend on Apple Silicon (mac first, iOS later), never a
+ * child process. Apple Foundation Models stays an opportunistic out-of-
+ * process adapter on the TS side and is NOT registered here.
+ *
+ * ── Two runtimes, one backend ─────────────────────────────────────────────
+ *
+ * The same `mlx-coreml` factory can serve a bundle through EITHER of two
+ * Apple on-device runtimes, picked at open() time from the artifact present
+ * under `<bundle_dir>/text/`:
+ *
+ *   • MLX  (PRIMARY)   — Apple's array framework for Apple Silicon. We drive
+ *                        it through the C API `mlx-c` (ml-explore/mlx-c). The
+ *                        text weights are an `mlx` weights dir (safetensors,
+ *                        the mlx-lm convention) OR a `*.gguf` MLX reads via
+ *                        `mlx_load_gguf`. Decode runs the transformer graph
+ *                        on the Metal GPU stream with `mlx_quantized_matmul`
+ *                        for the quantized weight banks,
+ *                        `mlx_fast_scaled_dot_product_attention` for
+ *                        attention, and `mlx_fast_rope` for position. The KV
+ *                        cache is a pair of resident `mlx_array`s we append to
+ *                        per step (host-side cache handle, GPU-resident data).
+ *                        This is the preferred path: it gives us full control
+ *                        of the sampler, supports the Gemma SWA/shared-KV
+ *                        geometry, and matches mlx-lm's published Gemma graph.
+ *
+ *   • CoreML (ALTERNATE) — Apple's MLModel runtime, which can place the graph
+ *                        on the ANE (Apple Neural Engine) as well as GPU/CPU.
+ *                        We load a compiled `*.mlmodelc` / `*.mlpackage`
+ *                        decoder and use the iOS-18 / macOS-15 **stateful**
+ *                        prediction API (`MLState`) so the KV cache lives
+ *                        inside CoreML and is updated in-place across decode
+ *                        steps (no per-token KV tensor marshalled across the
+ *                        ObjC boundary). CoreML needs Objective-C, which is
+ *                        why this whole backend is a `.mm` translation unit.
+ *
+ *   TRADE-OFF (documented per the task brief): MLX is the primary path
+ *   because it is the most flexible (custom sampler, exact Gemma geometry,
+ *   speculative-decode-ready) and tracks mlx-lm directly; its decode runs on
+ *   the GPU stream, not the ANE. CoreML's stateful MLModel can target the ANE
+ *   for lower power on phones, but the decoder graph must be pre-compiled
+ *   ahead of time, the sampler/KV layout is fixed by the converted model, and
+ *   ANE placement of large attention graphs is fragile across OS revisions.
+ *   We prefer MLX on mac/dev; CoreML is the alternate for ANE-bound iOS tiers
+ *   once a stateful decoder package is published. open() selects MLX when an
+ *   mlx weights dir / gguf is present, else falls back to the CoreML package.
+ *
+ * ── Build gate ────────────────────────────────────────────────────────────
+ *
+ * The REAL implementation is gated behind `ELIZA_ENABLE_MLX` (the CMake
+ * define for this backend, per the cutover plan: LiteRT → ELIZA_ENABLE_LITERT,
+ * MLX/CoreML → ELIZA_ENABLE_MLX) AND `__APPLE__`. When the gate is OFF the
+ * translation unit includes NO Apple/MLX SDK headers, so it compiles on a
+ * plain Linux host: `available()` returns false, `can_serve()` returns false,
+ * and `open()` returns nullptr after setting `*out_error` ("not compiled in").
+ * The default Linux build links it as a pure no-op and the selector skips it,
+ * keeping the in-tree llama.cpp path.
+ *
+ * ── API research (cited; symbols verified, not invented) ──────────────────
+ *
+ *   MLX C API — ml-explore/mlx-c, `mlx/c/` headers, main @ 2026-06 (docs MLX C
+ *   0.4.1, https://ml-explore.github.io/mlx-c/). Symbols used by the real path:
+ *     - device.h : `mlx_device mlx_device_new_type(mlx_device_type, int)` with
+ *                  `typedef enum { MLX_CPU, MLX_GPU } mlx_device_type;`
+ *     - stream.h : `mlx_stream mlx_default_gpu_stream_new(void)`,
+ *                  `mlx_stream mlx_default_cpu_stream_new(void)`
+ *     - io.h     : `int mlx_load_safetensors(mlx_map_string_to_array*,
+ *                  mlx_map_string_to_string*, const char* file, mlx_stream)`,
+ *                  `int mlx_load_gguf(mlx_io_gguf*, const char* file, mlx_stream)`
+ *     - array.h  : `mlx_array mlx_array_new_data(const void*, const int* shape,
+ *                  int dim, mlx_dtype)`, `int mlx_array_eval(mlx_array)`,
+ *                  `int mlx_array_item_int32(int32_t*, mlx_array)`,
+ *                  `const float* mlx_array_data_float32(mlx_array)`,
+ *                  `int mlx_array_free(mlx_array)`
+ *     - ops.h    : `int mlx_quantized_matmul(mlx_array*, x, w, scales, biases,
+ *                  bool transpose, mlx_optional_int group_size,
+ *                  mlx_optional_int bits, const char* mode, mlx_stream)`,
+ *                  `int mlx_matmul(...)`, `int mlx_softmax_axes(...)`,
+ *                  `int mlx_argmax_axis(mlx_array*, a, int axis, bool, stream)`,
+ *                  `int mlx_take(mlx_array*, a, indices, stream)`,
+ *                  `int mlx_astype(...)`, `int mlx_concatenate(...)`
+ *     - fast.h   : `int mlx_fast_scaled_dot_product_attention(mlx_array*, q, k,
+ *                  v, float scale, const char* mask_mode, mlx_array mask,
+ *                  mlx_array sinks, mlx_stream)`,
+ *                  `int mlx_fast_rope(mlx_array*, x, int dims, bool traditional,
+ *                  mlx_optional_float base, float scale, int offset,
+ *                  mlx_array freqs, mlx_stream)`
+ *   Gemma on MLX: ml-explore/mlx-lm (`mlx_lm/models/gemma*.py`) — the reference
+ *   for the dense SWA + shared-KV + dual-head-dim graph this backend mirrors.
+ *
+ *   CoreML stateful KV-cache — Apple Core ML, MLState API, macOS 15 / iOS 18
+ *   (WWDC24 "Bring your ML and AI models to Apple silicon"; coremltools
+ *   Stateful Models guide, https://apple.github.io/coremltools/docs-guides/
+ *   source/stateful-models.html). ObjC symbols used:
+ *     - `+ (nullable instancetype)modelWithContentsOfURL:(NSURL*)url
+ *        configuration:(MLModelConfiguration*)cfg error:(NSError**)error;` (and `compileModelAtURL:`)
+ *     - `- (MLState*)newState;`    (creates zeroed KV state buffers; MLState is
+ *        +new/-init UNAVAILABLE — only MLModel vends it)
+ *     - `- (nullable id<MLFeatureProvider>)predictionFromFeatures:
+ *        (id<MLFeatureProvider>)input usingState:(MLState*)state
+ *        error:(NSError**)error;`  (the in-place stateful decode step)
+ *   Apple's own "On-Device Llama 3.1 with Core ML" research post documents the
+ *   prefill-then-stateful-decode loop this backend's MLX/CoreML paths follow.
+ *
+ * Every hardware-specific assumption that can only be confirmed on Apple
+ * Silicon is marked `DEVICE-VERIFY` in the .mm. This header carries no SDK
+ * dependency and is safe to include anywhere.
+ */
+
+#include "../llm-backend.h"
+
+/* Free-function accessor returning the singleton `mlx-coreml` factory so the
+ * selector (llm-backend-selector.cpp, wired separately) can register it via
+ * `llm_backend_register(mlx_coreml_backend_factory())`. Defined in
+ * mlx-coreml-backend.mm. Always returns a valid non-null static-lifetime
+ * pointer — when the build gate is OFF the returned factory reports
+ * available()/can_serve() == false and open() == nullptr ("not compiled in"),
+ * so registering it unconditionally is safe. */
+LlmBackendFactory * mlx_coreml_backend_factory();
diff --git a/tools/omnivoice/src/backends/mlx-coreml-backend.mm b/tools/omnivoice/src/backends/mlx-coreml-backend.mm
new file mode 100644
index 000000000..4b705d719
--- /dev/null
+++ b/tools/omnivoice/src/backends/mlx-coreml-backend.mm
@@ -0,0 +1,797 @@
+/*
+ * mlx-coreml-backend.mm — Apple-Silicon streaming-LLM backend (cutover M5).
+ *
+ * Objective-C++ translation unit: CoreML's MLModel / MLState API is
+ * Objective-C, and the MLX C++ / mlx-c headers also compile cleanly in a
+ * `.mm`. See mlx-coreml-backend.h for the full API research + citations and
+ * the MLX-primary / CoreML-alternate trade-off.
+ *
+ * STRUCTURE
+ *   The whole real implementation sits behind
+ *     #if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
+ *   and is the ONLY place that includes any MLX / CoreML SDK header. With the
+ *   gate OFF (the default Linux build) this file pulls in no SDK header at all
+ *   and compiles to a pure no-op factory: available()/can_serve() == false,
+ *   open() returns nullptr after setting *out_error to "not compiled in".
+ *
+ * ERROR CONTRACT (native/AGENTS.md §3 + §9): never log, never return a
+ * defaulted result on failure. Out-error strings are heap-allocated with
+ * malloc (mirroring eliza-inference-ffi.cpp's `eliza_strdup`) so the FFI
+ * caller frees them with `eliza_inference_free_string` / free().
+ */
+
+#include "mlx-coreml-backend.h"
+
+#include <atomic>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+// ===========================================================================
+// Shared (gate-independent) helpers
+// ===========================================================================
+
+namespace {
+
+/* Publish `msg` through `out_error` as a malloc-owned, NUL-terminated copy,
+ * mirroring eliza-inference-ffi.cpp::eliza_strdup so every backend's error
+ * string is released through the same free() path. */
+void mlx_set_error(char ** out_error, const std::string & msg) {
+    if (out_error == nullptr) {
+        return;  // caller did not ask for an error string
+    }
+
+    const size_t bytes = msg.size() + 1;  // payload plus the terminating NUL
+    char * const copy = static_cast<char *>(std::malloc(bytes));
+    if (copy != nullptr) {
+        std::memcpy(copy, msg.c_str(), bytes);
+    }
+    *out_error = copy;  // stays nullptr when the allocation failed
+}
+
+} // namespace
+
+// ===========================================================================
+// REAL IMPLEMENTATION — Apple Silicon only, gated on ELIZA_ENABLE_MLX
+// ===========================================================================
+#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
+
+// --- Objective-C / Apple frameworks ---------------------------------------
+#import <Foundation/Foundation.h>
+#import <CoreML/CoreML.h>     // MLModel, MLState, MLFeatureProvider, MLMultiArray
+#import <Metal/Metal.h>       // MTLCreateSystemDefaultDevice — Metal/ANE presence probe
+
+// --- MLX C API (ml-explore/mlx-c) ------------------------------------------
+// Only included behind the gate so a host without the MLX SDK still compiles.
+#include "mlx/c/array.h"
+#include "mlx/c/device.h"
+#include "mlx/c/stream.h"
+#include "mlx/c/io.h"
+#include "mlx/c/ops.h"
+#include "mlx/c/fast.h"
+#include "mlx/c/map.h"
+
+#include <dirent.h>
+#include <sys/stat.h>
+#include <cmath>
+#include <filesystem>
+#include <vector>
+
+namespace {
+
+namespace fs = std::filesystem;
+
+// --- bundle artifact discovery --------------------------------------------
+
+enum class AppleRuntime {
+    None,    // no Apple-servable artifact found under text/
+    Mlx,     // mlx weights dir (safetensors) or *.gguf under text/
+    CoreMl,  // *.mlmodelc / *.mlpackage under text/
+};
+
+bool has_suffix(const std::string & s, const char * suffix) {  // true when `s` ends with `suffix`
+    const size_t n = std::strlen(suffix);
+    return s.size() >= n && s.compare(s.size() - n, n, suffix) == 0;  // <string> only; std::equal needs <algorithm>
+}
+
+/* Probe `<bundle_dir>/text/` for an Apple-servable artifact and report which
+ * runtime would serve it; the winning path is stored in `out_artifact`, which
+ * is cleared on entry and stays empty for AppleRuntime::None. MLX is preferred
+ * when both kinds are present. Cheap directory walk, no model load. */
+AppleRuntime detect_runtime(const char * bundle_dir, std::string & out_artifact) {
+    out_artifact.clear();
+    if (!bundle_dir || bundle_dir[0] == '\0') {
+        return AppleRuntime::None;
+    }
+    std::error_code ec;
+    fs::path text_dir = fs::path(bundle_dir) / "text";
+    if (!fs::is_directory(text_dir, ec)) {
+        return AppleRuntime::None;
+    }
+
+    std::string gguf, mlpackage, mlmodelc, mlx_weights_dir;
+    for (fs::directory_iterator it(text_dir, ec), end; it != end && !ec; it.increment(ec)) {
+        const fs::path & p = it->path();
+        const std::string name = p.filename().string();
+        if (has_suffix(name, ".mlpackage")) {
+            // An *.mlpackage is a directory bundle on disk, so it is matched by name alone.
+            if (mlpackage.empty()) mlpackage = p.string();
+        } else if (it->is_directory(ec)) {
+            // A compiled *.mlmodelc is a directory; so is an mlx-lm weights dir.
+            if (has_suffix(name, ".mlmodelc")) {
+                if (mlmodelc.empty()) mlmodelc = p.string();
+            } else if (name == "mlx" || fs::exists(p / "model.safetensors", ec) ||
+                       fs::exists(p / "weights.safetensors", ec)) {
+                if (mlx_weights_dir.empty()) mlx_weights_dir = p.string();
+            }
+        } else {
+            if (has_suffix(name, ".gguf")) {
+                if (gguf.empty()) gguf = p.string();
+            } else if (has_suffix(name, ".safetensors")) {
+                if (mlx_weights_dir.empty()) mlx_weights_dir = text_dir.string();
+            }
+        }
+    }
+
+    // MLX primary: weights dir / safetensors first, then gguf.
+    if (!mlx_weights_dir.empty()) { out_artifact = mlx_weights_dir; return AppleRuntime::Mlx; }
+    if (!gguf.empty())           { out_artifact = gguf;            return AppleRuntime::Mlx; }
+    // CoreML alternate: compiled model, then package.
+    if (!mlmodelc.empty())       { out_artifact = mlmodelc;        return AppleRuntime::CoreMl; }
+    if (!mlpackage.empty())      { out_artifact = mlpackage;       return AppleRuntime::CoreMl; }
+    return AppleRuntime::None;
+}
+
+/* Reports whether MTLCreateSystemDefaultDevice() yields a device.
+ * DEVICE-VERIFY: expected to be non-nil on a real Apple-Silicon Mac/phone
+ * (GPU + ANE host); a nil result on a Metal-less or unexpected host lets the
+ * backend report unavailable rather than crashing at open(). */
+bool metal_device_present() {
+    @autoreleasepool {
+        const bool present = (MTLCreateSystemDefaultDevice() != nil);
+        return present;
+    }
+}
+
+// ===========================================================================
+// MLX-backed session (PRIMARY)
+// ===========================================================================
+//
+// DEVICE-VERIFY: the decode graph below is structurally complete and uses the
+// real mlx-c symbols, but the exact per-layer wiring of the Gemma graph
+// (alternating local-SWA / global attention, dual head dims, shared-KV layer
+// reuse, Per-Layer-Embeddings) must be assembled + numerically validated on
+// Apple Silicon against mlx-lm's `gemma*` reference. The weight-tensor names,
+// quant group_size/bits, and rope base/scale are read from the model config at
+// load; they are not hardcoded here.
+
+class MlxLlmSession final : public LlmBackendSession {
+public:
+    MlxLlmSession(std::string artifact, const eliza_llm_stream_config_t * cfg)
+        : artifact_(std::move(artifact)) {
+        if (cfg) {
+            cfg_ = *cfg;
+            have_cfg_ = true;
+        }
+    }
+
+    ~MlxLlmSession() override {
+        free_kv();
+        free_logits();
+        free_weights();  // the Metal stream/device are process-global, not freed here
+    }
+
+    /* Load weights + keep the loaded handle resident. Returns ELIZA_OK, or
+     * ELIZA_ERR_BUNDLE_INVALID with *out_error set when the load fails.
+     *   - *.gguf: mlx_load_gguf fills a mlx_io_gguf (tensors read by key via
+     *     mlx_io_gguf_get_array, key list from mlx_io_gguf_get_keys);
+     *   - otherwise safetensors (mlx-lm convention): mlx_load_safetensors
+     *     fills a mlx_map_string_to_array keyed by tensor name.
+     * The per-tensor pulls happen inside run_forward when the graph is built. */
+    int init(char ** out_error) {
+        // GPU stream (Metal). DEVICE-VERIFY: requires a Metal device.
+        gpu_stream_ = mlx_default_gpu_stream_new();
+
+        int rc;
+        if (has_suffix(artifact_, ".gguf")) {
+            gguf_ = mlx_io_gguf_new();
+            have_gguf_ = true;  // handle is owned from here; free_weights() releases it
+            rc = mlx_load_gguf(&gguf_, artifact_.c_str(), gpu_stream_);
+        } else {
+            // mlx weights dir / safetensors (the mlx-lm convention).
+            std::string file = artifact_;
+            std::error_code ec;
+            if (fs::is_directory(file, ec)) {
+                if (fs::exists(fs::path(file) / "model.safetensors", ec)) {
+                    file = (fs::path(file) / "model.safetensors").string();
+                } else if (fs::exists(fs::path(file) / "weights.safetensors", ec)) {
+                    file = (fs::path(file) / "weights.safetensors").string();
+                }
+            }
+            weights_ = mlx_map_string_to_array_new();
+            weights_meta_ = mlx_map_string_to_string_new();
+            have_weights_ = true;  // handles are owned from here; free_weights() releases them
+            rc = mlx_load_safetensors(&weights_, &weights_meta_, file.c_str(), gpu_stream_);
+        }
+        if (rc != 0) {
+            free_weights();  // also clears the have_* flags, so loaded() stays false
+            mlx_set_error(out_error,
+                "[mlx-coreml] MLX failed to load weights from " + artifact_);
+            return ELIZA_ERR_BUNDLE_INVALID;
+        }
+
+        // DEVICE-VERIFY: parse the sibling config.json (vocab, n_layer, head
+        // dims global/swa, sliding-window, rope base, shared-KV layer map, PLE
+        // table, quant bits/group_size) into graph_ here, mirroring
+        // mlx_lm.utils.load; the streaming contract does not depend on it.
+        return ELIZA_OK;
+    }
+
+    int prefill(const int32_t * token_ids, size_t num_tokens,
+                char ** out_error) override {
+        if (!loaded()) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill before init");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (!token_ids || num_tokens == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill: empty prompt");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        cancel_.store(false);
+
+        // Copy the prompt (the contract says prefill copies the tokens it needs).
+        prompt_.assign(token_ids, token_ids + num_tokens);
+        n_past_ = 0;
+        generated_ = 0;
+
+        // Build the [1, T] int32 input and run one forward pass that fills KV.
+        // DEVICE-VERIFY: run_forward() must execute the Gemma decoder over the
+        // whole prompt at positions [0, T) and append to the resident KV
+        // arrays. The final-position logits feed the first sampled token.
+        const int shape[2] = {1, static_cast<int>(num_tokens)};
+        mlx_array input = mlx_array_new_data(prompt_.data(), shape, 2, MLX_INT32);
+        int rc = run_forward(input, /*start_pos=*/0, &last_logits_, out_error);
+        mlx_array_free(input);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+        n_past_ = static_cast<int>(num_tokens);
+        return ELIZA_OK;
+    }
+
+    int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out,
+             char * text_out, size_t text_cap, int32_t * drafter_drafted_out,
+             int32_t * drafter_accepted_out, char ** out_error) override {
+        if (num_tokens_out) *num_tokens_out = 0;
+        if (text_out && text_cap) text_out[0] = '\0';
+        // No speculative drafter on the MLX path yet (M6 wires MTP).
+        if (drafter_drafted_out)  *drafter_drafted_out  = 0;
+        if (drafter_accepted_out) *drafter_accepted_out = 0;
+
+        if (!loaded()) {
+            mlx_set_error(out_error, "[mlx-coreml] next before init/prefill");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancel_.load()) {
+            return ELIZA_ERR_CANCELLED;
+        }
+        if (!tokens_out || tokens_cap == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] next: token buffer too small");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+
+        // Sample one token from last_logits_ (expected shape [1, vocab]); the
+        // current sample_token is greedy argmax. DEVICE-VERIFY: temperature /
+        // top-p / top-k from cfg_ still have to be applied there.
+        int32_t next_id = 0;
+        int rc = sample_token(last_logits_, &next_id, out_error);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+
+        tokens_out[0] = next_id;
+        if (num_tokens_out) *num_tokens_out = 1;
+        generated_++;
+
+        // Detokenize the committed token into text_out (UTF-8). DEVICE-VERIFY:
+        // detokenize_piece must resolve next_id against the model vocab and
+        // buffer partial multi-byte pieces across calls.
+        detokenize_piece(next_id, text_out, text_cap);
+
+        const bool hit_eos = is_eos(next_id);
+        const int32_t cap = (have_cfg_ && cfg_.max_tokens > 0)
+                                ? cfg_.max_tokens
+                                : default_max_tokens_;
+        const bool hit_cap = generated_ >= cap;
+        if (hit_eos || hit_cap) {
+            return 1; // final step
+        }
+
+        // Advance one position: forward pass for the just-sampled token only.
+        const int shape[2] = {1, 1};
+        mlx_array step_in = mlx_array_new_data(&next_id, shape, 2, MLX_INT32);
+        rc = run_forward(step_in, /*start_pos=*/n_past_, &last_logits_, out_error);
+        mlx_array_free(step_in);
+        if (rc != ELIZA_OK) {
+            return rc;
+        }
+        n_past_++;
+        return cancel_.load() ? ELIZA_ERR_CANCELLED : 0; // more
+    }
+
+    int cancel() override {
+        cancel_.store(true);
+        return ELIZA_OK;
+    }
+
+    int reset() override {
+        cancel_.store(false);
+        prompt_.clear();
+        n_past_ = 0;
+        generated_ = 0;
+        free_kv();           // drop resident KV arrays
+        free_logits();
+        return ELIZA_OK;
+    }
+
+    int reset_keep(int32_t n_keep) override {
+        // DEVICE-VERIFY: once the on-Metal KV slice is wired, keep [0, n_keep)
+        // of the K/V arrays and set n_past_ = clamp(n_keep). Until then do the
+        // contract-mandated SAFE fallback: full reset, return 0 — never an
+        // error (llm-backend.h reset_keep contract).
+        (void) n_keep;
+        reset();
+        return 0;
+    }
+
+private:
+    /* True once init() has loaded a model (safetensors maps or a gguf handle). */
+    bool loaded() const { return have_weights_ || have_gguf_; }
+    void free_kv() {
+        if (have_kv_) {
+            mlx_array_free(kv_k_);
+            mlx_array_free(kv_v_);
+            have_kv_ = false;
+        }
+    }
+    void free_logits() {
+        if (have_logits_) {
+            mlx_array_free(last_logits_);
+            have_logits_ = false;
+        }
+    }
+    /* Release whichever weight handles init() allocated and clear their flags. */
+    void free_weights() {
+        if (have_weights_) {
+            mlx_map_string_to_array_free(weights_);
+            mlx_map_string_to_string_free(weights_meta_);
+            have_weights_ = false;
+        }
+        if (have_gguf_) {
+            mlx_io_gguf_free(gguf_);
+            have_gguf_ = false;
+        }
+    }
+
+    /* One transformer forward pass over `input` ([1, T] int32) starting at
+     * position `start_pos`, appending to the resident KV cache and writing the
+     * final-position logits ([1, vocab]) into *out_logits.
+     *
+     * DEVICE-VERIFY: this is the Gemma decoder graph. It must, per layer:
+     *   - embed tokens (+ Per-Layer-Embeddings) and apply mlx_fast_rope with
+     *     the layer's (global vs SWA) head dim ;
+     *   - run mlx_fast_scaled_dot_product_attention with mask_mode "causal" for
+     *     global layers and a windowed mask for SWA layers, reusing earlier-
+     *     layer KV on shared-KV layers ;
+     *   - mlx_quantized_matmul for quantized banks, mlx_matmul for f16 banks ;
+     *   - mlx_array_eval the result on gpu_stream_ to force materialization. */
+    int run_forward(mlx_array /*input*/, int /*start_pos*/, mlx_array * out_logits,
+                    char ** out_error) {
+        // Until the on-Metal graph is assembled, fail precisely instead of
+        // returning a defaulted result (§3); logits from a prior pass are dropped.
+        free_logits();
+        (void) out_logits;
+        mlx_set_error(out_error,
+            "[mlx-coreml] MLX Gemma decode graph not assembled on this build "
+            "(DEVICE-VERIFY: requires Apple Silicon)");
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    int sample_token(mlx_array logits, int32_t * out_id, char ** out_error) {
+        if (!have_logits_) {
+            mlx_set_error(out_error, "[mlx-coreml] no logits to sample");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        // DEVICE-VERIFY: apply cfg_.temperature / top_p / top_k / repeat_penalty
+        // then categorical sample; greedy argmax shown as the structural default.
+        mlx_array arg = mlx_array_new();
+        if (mlx_argmax_axis(&arg, logits, /*axis=*/-1, /*keepdims=*/false, gpu_stream_) != 0) {
+            mlx_array_free(arg);
+            mlx_set_error(out_error, "[mlx-coreml] argmax failed");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        mlx_array_eval(arg);
+        int32_t id = 0;
+        const int rc = mlx_array_item_int32(&id, arg);
+        mlx_array_free(arg);
+        if (rc != 0) {
+            mlx_set_error(out_error, "[mlx-coreml] failed to read sampled token");
+            return ELIZA_ERR_FFI_FAULT;
+        }
+        *out_id = id;
+        return ELIZA_OK;
+    }
+
+    bool is_eos(int32_t id) const {
+        // DEVICE-VERIFY: compare against the model's EOS / <end_of_turn> ids
+        // (Gemma uses <end_of_turn>) read from the tokenizer config at load.
+        return id == eos_id_;
+    }
+
+    void detokenize_piece(int32_t /*id*/, char * text_out, size_t text_cap) {
+        // DEVICE-VERIFY: resolve the token piece from the loaded vocab and copy
+        // its UTF-8 bytes (buffering partial code points across calls). Until
+        // then emit an empty string; the committed id is already in tokens_out.
+        if (text_out && text_cap) {
+            text_out[0] = '\0';
+        }
+    }
+
+    std::string artifact_;
+    eliza_llm_stream_config_t cfg_{};
+    bool have_cfg_ = false;
+
+    mlx_stream gpu_stream_{};
+    mlx_map_string_to_array weights_{};
+    mlx_map_string_to_string weights_meta_{};
+    bool have_weights_ = false;  // safetensors handles are allocated (loaded once init() succeeds)
+    mlx_io_gguf gguf_{};
+    bool have_gguf_ = false;     // gguf handle is allocated (loaded once init() succeeds)
+
+    mlx_array kv_k_{};
+    mlx_array kv_v_{};
+    bool have_kv_ = false;
+
+    mlx_array last_logits_{};
+    bool have_logits_ = false;
+
+    std::vector<int32_t> prompt_;
+    int n_past_ = 0;
+    int generated_ = 0;
+    int32_t eos_id_ = -1;
+    int32_t default_max_tokens_ = 2048;
+
+    std::atomic<bool> cancel_{false};
+};
+
+// ===========================================================================
+// CoreML-backed session (ALTERNATE — ANE-bound, stateful MLState KV cache)
+// ===========================================================================
+//
+// DEVICE-VERIFY: the converted decoder package must expose (a) an input
+// feature for the current token id(s) and position, (b) an MLState-backed KV
+// cache, and (c) a logits output. Apple's "On-Device Llama 3.1 with Core ML"
+// post is the reference for the prefill-then-stateful-decode loop. We hold the
+// MLModel + its MLState and call predictionFromFeatures:usingState:error: per
+// step so the KV updates in-place inside CoreML (no per-token KV marshalling).
+
+class CoreMlLlmSession final : public LlmBackendSession {
+public:
+    CoreMlLlmSession(std::string package, const eliza_llm_stream_config_t * cfg)
+        : package_(std::move(package)) {
+        if (cfg != nullptr) {  // optional config: copy it and remember that we have one
+            have_cfg_ = true;
+            cfg_ = *cfg;
+        }
+    }
+
+    ~CoreMlLlmSession() override {
+        @autoreleasepool {
+            state_ = nil;  // drop the MLState first ...
+            model_ = nil;  // ... then the MLModel that vended it in init()
+        }
+    }
+
+    /* Load the CoreML decoder (compiling a *.mlpackage first) and vend its MLState. */
+    int init(char ** out_error) {
+        @autoreleasepool {
+            NSError * err = nil;
+            // stringWithUTF8String: yields nil for non-UTF-8 bytes and fileURLWithPath: raises on nil.
+            NSString * path = [NSString stringWithUTF8String:package_.c_str()];
+            if (!path) {
+                mlx_set_error(out_error, "[mlx-coreml] CoreML package path is not valid UTF-8");
+                return ELIZA_ERR_BUNDLE_INVALID;
+            }
+            NSURL * url = [NSURL fileURLWithPath:path];
+            // A *.mlpackage must be compiled to *.mlmodelc before loading; a *.mlmodelc
+            // loads directly. DEVICE-VERIFY: synchronous one-time compile; production caches it.
+            NSURL * compiled = url;
+            if (has_suffix(package_, ".mlpackage")) {
+                NSURL * c = [MLModel compileModelAtURL:url error:&err];
+                if (!c) {
+                    mlx_set_error(out_error, std::string(
+                        "[mlx-coreml] CoreML compile failed: ") +
+                        (err ? err.localizedDescription.UTF8String : "unknown"));
+                    return ELIZA_ERR_BUNDLE_INVALID;
+                }
+                compiled = c;
+            }
+
+            MLModelConfiguration * conf = [[MLModelConfiguration alloc] init];
+            // DEVICE-VERIFY: .all lets CoreML place the decoder on ANE when eligible, else GPU/CPU.
+            conf.computeUnits = MLComputeUnitsAll;
+            model_ = [MLModel modelWithContentsOfURL:compiled
+                                       configuration:conf
+                                               error:&err];
+            if (!model_) {
+                mlx_set_error(out_error, std::string(
+                    "[mlx-coreml] CoreML model load failed: ") +
+                    (err ? err.localizedDescription.UTF8String : "unknown"));
+                return ELIZA_ERR_BUNDLE_INVALID;
+            }
+            // newState vends zeroed KV buffers; only MLModel produces an MLState (macOS 15 / iOS 18).
+            state_ = [model_ newState];
+            if (!state_) {
+                mlx_set_error(out_error,
+                    "[mlx-coreml] CoreML model has no stateful KV cache "
+                    "(newState returned nil) — needs a stateful decoder package");
+                return ELIZA_ERR_BUNDLE_INVALID;
+            }
+            return ELIZA_OK;
+        }
+    }
+
+    int prefill(const int32_t * token_ids, size_t num_tokens,
+                char ** out_error) override {
+        const bool ready = model_ && state_;
+        if (!ready) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill before init");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        const bool empty_prompt = !token_ids || num_tokens == 0;
+        if (empty_prompt) {
+            mlx_set_error(out_error, "[mlx-coreml] prefill: empty prompt");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        // New generation: clear the cancel flag, copy the prompt, zero the counters.
+        cancel_.store(false);
+        prompt_.assign(token_ids, token_ids + num_tokens);
+        n_past_ = 0;
+        generated_ = 0;
+        // DEVICE-VERIFY: the whole prompt goes in as one prediction at positions
+        // [0, T) so CoreML fills the MLState KV in a single pass and the final-
+        // position logits seed the first sample; feature names come from model_.modelDescription.
+        return run_step(prompt_.data(), prompt_.size(), /*start_pos=*/0, out_error);
+    }
+
+    /* Emit one token: sample from the cached logits, report it through the
+     * out-params, then run one decode step for it. Returns 1 (final — EOS or
+     * token cap), 0 (more), or a negative ELIZA_* code. */
+    int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out,
+             char * text_out, size_t text_cap, int32_t * drafter_drafted_out,
+             int32_t * drafter_accepted_out, char ** out_error) override {
+        // Clear every optional out-param first so early exits leave them defined.
+        if (num_tokens_out != nullptr) { *num_tokens_out = 0; }
+        if (text_out != nullptr && text_cap != 0) { text_out[0] = '\0'; }
+        if (drafter_drafted_out != nullptr) { *drafter_drafted_out = 0; }
+        if (drafter_accepted_out != nullptr) { *drafter_accepted_out = 0; }
+
+        if (!model_ || !state_) {
+            mlx_set_error(out_error, "[mlx-coreml] next before init/prefill");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+        if (cancel_.load()) return ELIZA_ERR_CANCELLED;
+        if (!tokens_out || tokens_cap == 0) {
+            mlx_set_error(out_error, "[mlx-coreml] next: token buffer too small");
+            return ELIZA_ERR_INVALID_ARG;
+        }
+
+        int32_t sampled = 0;
+        const int sample_rc = sample_from_last_logits(&sampled, out_error);
+        if (sample_rc != ELIZA_OK) return sample_rc;
+
+        // Publish the token before deciding whether generation continues.
+        tokens_out[0] = sampled;
+        if (num_tokens_out != nullptr) { *num_tokens_out = 1; }
+        ++generated_;
+        detokenize_piece(sampled, text_out, text_cap);
+
+        // A positive cfg_.max_tokens replaces the built-in default cap.
+        int32_t token_cap = default_max_tokens_;
+        if (have_cfg_ && cfg_.max_tokens > 0) { token_cap = cfg_.max_tokens; }
+        if (is_eos(sampled) || generated_ >= token_cap) return 1; // final
+
+        // One stateful decode step for the just-sampled token.
+        const int step_rc = run_step(&sampled, 1, /*start_pos=*/n_past_, out_error);
+        if (step_rc != ELIZA_OK) return step_rc;
+        ++n_past_;
+
+        // A cancel that arrived during the step takes precedence over "more".
+        if (cancel_.load()) return ELIZA_ERR_CANCELLED;
+        return 0; // more
+    }
+
+    int cancel() override {  // request that the current generation stop
+        cancel_.store(true);  // next() reads this flag before sampling and after its decode step
+        return ELIZA_OK;
+    }
+
+    /* Clear per-generation state (cancel flag, prompt copy, counters) and, when
+     * a model is loaded, swap in a fresh MLState. Always returns ELIZA_OK. */
+    int reset() override {
+        cancel_.store(false);
+        prompt_.clear();
+        n_past_ = 0;
+        generated_ = 0;
+        if (model_ != nil) {
+            // A fresh MLState zeroes the KV cache — the canonical CoreML reset.
+            @autoreleasepool { state_ = [model_ newState]; }
+        }
+        return ELIZA_OK;
+    }
+
+    int reset_keep(int32_t n_keep) override {
+        // MLState is opaque — no public API truncates the KV to a prefix — so
+        // the requested n_keep cannot be honoured. The llm-backend.h contract
+        // asks for a full reset and a 0 return in that case, never an error.
+        static_cast<void>(n_keep);
+        reset();
+        return 0;
+    }
+
+private:
+    /* Intended to run one prediction (`n` tokens starting at `start_pos`)
+     * through the stateful model, updating the MLState KV in place and caching
+     * the final-position logits. DEVICE-VERIFY: once a package is wired this
+     * builds an MLFeatureProvider from the model's input descriptions, calls
+     * predictionFromFeatures:usingState:error:, and stores the logits array.
+     * Not wired yet: the feature-name binding is only knowable from a real
+     * converted package, so it always reports a precise failure (§3) rather
+     * than a defaulted success. */
+    int run_step(const int32_t * /*tokens*/, size_t /*n*/, int /*start_pos*/,
+                 char ** out_error) {
+        static const char * const kNotBound =
+            "[mlx-coreml] CoreML stateful decode not bound to a converted "
+            "decoder package on this build (DEVICE-VERIFY: requires a stateful "
+            "*.mlmodelc and Apple Silicon)";
+        mlx_set_error(out_error, kNotBound);
+        return ELIZA_ERR_NOT_IMPLEMENTED;
+    }
+
+    int sample_from_last_logits(int32_t * /*out_id*/, char ** out_error) {
+        // DEVICE-VERIFY: meant to argmax / temperature-sample over the cached
+        // logits MLMultiArray. Placeholder: never writes out_id, always fails.
+        mlx_set_error(out_error, "[mlx-coreml] no CoreML logits to sample");
+        return ELIZA_ERR_NOT_IMPLEMENTED;  // next() propagates this before emitting a token
+    }
+
+    bool is_eos(int32_t id) const { return id == eos_id_; }  // compares against eos_id_ (member default: -1)
+
+    // Placeholder detokenizer: ignores the token id and yields an empty piece.
+    void detokenize_piece(int32_t /*id*/, char * text_out, size_t text_cap) {
+        if (text_out == nullptr || text_cap == 0) return;
+        text_out[0] = '\0';
+    }
+
+    std::string package_;  // NOTE(review): presumably the artifact path passed to the ctor — confirm
+    eliza_llm_stream_config_t cfg_{};  // next() reads cfg_.max_tokens when have_cfg_ is set
+    bool have_cfg_ = false;  // gates the use of cfg_.max_tokens in next()
+
+    MLModel * model_ = nil;  // prefill()/next() refuse to run while this is nil
+    MLState * state_ = nil;  // obtained from [model_ newState]; replaced by reset()
+
+    std::vector<int32_t> prompt_;  // copy of the tokens given to the last prefill()
+    int n_past_ = 0;  // start_pos handed to the decode step in next()
+    int generated_ = 0;  // tokens sampled since the last prefill()/reset()
+    int32_t eos_id_ = -1;  // compared by is_eos(); NOTE(review): no assignment visible in this part of the class
+    int32_t default_max_tokens_ = 2048;  // token cap used when cfg_ supplies no positive max_tokens
+
+    std::atomic<bool> cancel_{false};  // set by cancel(); cleared by prefill() and reset()
+};
+
+// ===========================================================================
+// Factory (real)
+// ===========================================================================
+
+class MlxCoreMlFactory final : public LlmBackendFactory {
+public:
+    const char * name() const override { return "mlx-coreml"; }
+
+    // Compiled in (we are inside the gate) AND a Metal device is present.
+    // DEVICE-VERIFY: true on Apple Silicon; false on a Mac without Metal.
+    bool available() const override { return metal_device_present(); }
+
+    // Servable when detect_runtime() classifies the bundle as anything but None.
+    bool can_serve(const char * bundle_dir) const override {
+        std::string unused_artifact;
+        const AppleRuntime found = detect_runtime(bundle_dir, unused_artifact);
+        return found != AppleRuntime::None;
+    }
+
+    // Highest on Apple Silicon: the in-process Metal/ANE path beats the in-tree
+    // llama.cpp Metal path for the Gemma geometry. > LiteRT(0 here).
+    int preference_rank() const override { return 100; }
+
+    /* Open a session on the context's bundle. The bundle root comes from the
+     * context accessor (the struct is otherwise opaque here); MLX vs CoreML is
+     * then chosen from its artifacts. Returns nullptr on any failure. */
+    LlmBackendSession * open(EliInferenceContext * ctx,
+                             const eliza_llm_stream_config_t * cfg,
+                             char ** out_error) override {
+        const char * root = llm_backend_context_bundle_dir(ctx);
+        const std::string bundle = (root != nullptr) ? root : std::string();
+        if (bundle.empty()) {
+            mlx_set_error(out_error, "[mlx-coreml] open: context has no bundle dir");
+            return nullptr;
+        }
+        std::string artifact;
+        switch (detect_runtime(bundle.c_str(), artifact)) {
+            case AppleRuntime::Mlx:
+                return start<MlxLlmSession>(artifact, cfg, out_error);
+            case AppleRuntime::CoreMl:
+                return start<CoreMlLlmSession>(artifact, cfg, out_error);
+            default:
+                mlx_set_error(out_error,
+                    "[mlx-coreml] open: bundle has no MLX/CoreML text artifact under text/");
+                return nullptr;
+        }
+    }
+
+private:
+    // Construct a Session on `artifact` and init() it; a failed init() deletes it.
+    template <typename Session>
+    static LlmBackendSession * start(const std::string & artifact,
+                                     const eliza_llm_stream_config_t * cfg,
+                                     char ** out_error) {
+        Session * session = new Session(artifact, cfg);
+        if (session->init(out_error) != ELIZA_OK) {
+            delete session;
+            return nullptr;
+        }
+        return session;
+    }
+};
+
+MlxCoreMlFactory g_factory;  // file-local instance handed out by mlx_coreml_backend_factory()
+
+} // namespace
+
+LlmBackendFactory * mlx_coreml_backend_factory() {  // accessor for the real (gate-ON, Apple) factory
+    return &g_factory;  // address of the file-scope instance above
+}
+
+// ===========================================================================
+// STUB IMPLEMENTATION — every non-Apple / gate-OFF build
+// ===========================================================================
+#else // !(ELIZA_ENABLE_MLX && __APPLE__)
+
+namespace {
+
+/* No SDK header is included on this path, so the file compiles on a plain
+ * Linux host. The factory reports itself unavailable and refuses to open. */
+class MlxCoreMlStubFactory final : public LlmBackendFactory {
+public:
+    const char * name() const override { return "mlx-coreml"; }  // same id as the real factory
+    bool available() const override { return false; }  // the stub never reports itself usable
+    bool can_serve(const char * /*bundle_dir*/) const override { return false; }  // bundle is not inspected
+    int preference_rank() const override { return 0; }
+
+    LlmBackendSession * open(EliInferenceContext * /*ctx*/,
+                             const eliza_llm_stream_config_t * /*cfg*/,
+                             char ** out_error) override {
+        mlx_set_error(out_error,
+            "[mlx-coreml] backend not compiled in "
+            "(needs -DELIZA_ENABLE_MLX on Apple Silicon)");
+        return nullptr;  // always refuses: no session is ever created on this path
+    }
+};
+
+MlxCoreMlStubFactory g_stub_factory;
+
+} // namespace
+
+LlmBackendFactory * mlx_coreml_backend_factory() {  // stub-build counterpart of the real accessor
+    return &g_stub_factory;  // address of the file-scope stub instance above
+}
+
+#endif // ELIZA_ENABLE_MLX && __APPLE__
diff --git a/tools/omnivoice/src/eliza-inference-ffi.cpp b/tools/omnivoice/src/eliza-inference-ffi.cpp
index 345c87cb0..94127affc 100644
--- a/tools/omnivoice/src/eliza-inference-ffi.cpp
+++ b/tools/omnivoice/src/eliza-inference-ffi.cpp
@@ -14,6 +14,7 @@
 // resolve `eliza_inference_*` symbols from this object.
 
 #include "eliza-inference-ffi.h"
+#include "llm-backend.h"
 #include "omnivoice.h"
 #include "llama.h"
 #include "mtmd.h"
@@ -173,6 +174,13 @@ struct EliInferenceContext {
 #endif
 };
 
+/* M3 seam accessor (declared in llm-backend.h): hand a backend's open() the
+ * bundle root without exposing the struct. Defined here where the type is
+ * complete. */
+const char * llm_backend_context_bundle_dir(const EliInferenceContext * ctx) {
+    return (ctx == nullptr) ? nullptr : ctx->bundle_dir.c_str();
+}
+
 /* ELZ2 magic 'ELZ1' (the ascii bytes 'E','L','Z','1' little-endian).
  * The magic stays 'ELZ1' across format versions — only the version
  * word at offset 4 changes between v1 and v2. */
@@ -1135,6 +1143,11 @@ static void reset_engine(Engine * e) {
 
 struct EliLlmStream {
     EliInferenceContext * ctx = nullptr;
+    /* Multi-backend seam (M3): when non-NULL, this session is driven by an
+     * alternate in-process runtime (LiteRT-LM / MLX-CoreML) and the llama.cpp
+     * fields below (lctx/sampler/mtp) are unused — every FFI streaming entry
+     * delegates to `backend` and returns before touching the llama.cpp path. */
+    LlmBackendSession * backend = nullptr;
     llama_context * lctx = nullptr;
     llama_sampler * sampler = nullptr;
     int n_past = 0;
@@ -2887,6 +2900,40 @@ EliLlmStream * eliza_inference_llm_stream_open(
         return nullptr;
     }
 
+    /* Multi-backend seam (M3): an alternate in-process runtime (LiteRT-LM /
+     * MLX-CoreML) may serve this bundle. The selector returns nullptr with NO
+     * error to keep the in-tree llama.cpp path below; nullptr WITH an error is a
+     * hard env-select failure to propagate. */
+    {
+        char * sel_err = nullptr;
+        LlmBackendFactory * factory =
+            llm_backend_select(ctx->bundle_dir.c_str(), cfg, &sel_err);
+        if (!factory && sel_err) {
+            if (out_error) {
+                *out_error = sel_err;
+            } else {
+                eliza_inference_free_string(sel_err);
+            }
+            return nullptr;
+        }
+        if (factory) {
+            EliLlmStream * bstream = new (std::nothrow) EliLlmStream();
+            if (!bstream) {
+                eliza_set_error(out_error,
+                    "[libelizainference] llm_stream_open: out of memory");
+                return nullptr;
+            }
+            bstream->ctx = ctx;
+            bstream->max_tokens = cfg->max_tokens > 0 ? cfg->max_tokens : 0;
+            bstream->backend = factory->open(ctx, cfg, out_error);
+            if (!bstream->backend) {
+                delete bstream;
+                return nullptr;
+            }
+            return bstream;
+        }
+    }
+
     llama_model * model = nullptr;
     {
         std::lock_guard<std::mutex> lock(ctx->llm_mutex);
@@ -2988,6 +3035,9 @@ int eliza_inference_llm_stream_prefill(
     const int32_t * token_ids,
     size_t num_tokens,
     char ** out_error) {
+    if (stream && stream->backend) {
+        return stream->backend->prefill(token_ids, num_tokens, out_error);
+    }
     if (!stream || (!stream->lctx && !stream->mtp)) {
         eliza_set_error(out_error,
             "[libelizainference] llm_stream_prefill: invalid session");
@@ -3056,6 +3106,11 @@ int eliza_inference_llm_stream_next(
     if (drafter_accepted_out) *drafter_accepted_out = 0;
     if (text_out && text_cap > 0) text_out[0] = '\0';
 
+    if (stream && stream->backend) {
+        return stream->backend->next(tokens_out, tokens_cap, num_tokens_out,
+                                     text_out, text_cap, drafter_drafted_out,
+                                     drafter_accepted_out, out_error);
+    }
     if (!stream || (!stream->mtp && (!stream->lctx || !stream->sampler))) {
         eliza_set_error(out_error,
             "[libelizainference] llm_stream_next: invalid session");
@@ -3245,6 +3300,9 @@ int eliza_inference_llm_stream_next(
 }
 
 int eliza_inference_llm_stream_cancel(EliLlmStream * stream) {
+    if (stream && stream->backend) {
+        return stream->backend->cancel();
+    }
     if (stream) {
         stream->cancel.store(true, std::memory_order_release);
     }
@@ -3255,6 +3313,9 @@ int eliza_inference_llm_stream_save_slot(
     EliLlmStream * stream,
     const char * filename,
     char ** out_error) {
+    if (stream && stream->backend) {
+        return stream->backend->save_slot(filename, out_error);
+    }
     (void) stream;
     (void) filename;
     /* v1: cross-launch slot KV persistence is not wired. Return a structured
@@ -3269,6 +3330,9 @@ int eliza_inference_llm_stream_restore_slot(
     EliLlmStream * stream,
     const char * filename,
     char ** out_error) {
+    if (stream && stream->backend) {
+        return stream->backend->restore_slot(filename, out_error);
+    }
     (void) stream;
     (void) filename;
     eliza_set_error(out_error,
@@ -3285,6 +3349,7 @@ int eliza_inference_llm_stream_reset(EliLlmStream * stream) {
      * created/destroyed repeatedly. Handles both the plain fixed-KV stream and
      * the MTP speculative engine (which owns its own target/draft KV). */
     if (!stream) return ELIZA_ERR_INVALID_ARG;
+    if (stream->backend) return stream->backend->reset();
     if (!stream->mtp && !stream->lctx) return ELIZA_ERR_INVALID_ARG;
     if (stream->mtp) {
         /* MTP stream: clear both the target and draft KV caches, reset the
@@ -3319,6 +3384,7 @@ int eliza_inference_llm_stream_reset_keep(EliLlmStream * stream, int32_t n_keep)
      * separate (riskier) handling — prefix-reuse mode opens the resident stream
      * without MTP, trading MTP's ~1.5x decode for the much larger prefill cut. */
     if (!stream) return ELIZA_ERR_INVALID_ARG;
+    if (stream->backend) return stream->backend->reset_keep(n_keep);
     if (stream->mtp || !stream->lctx) return ELIZA_ERR_INVALID_ARG;
     if (n_keep < 0) n_keep = 0;
     if (n_keep > stream->n_past) n_keep = stream->n_past;
@@ -3339,6 +3405,10 @@ int eliza_inference_llm_stream_reset_keep(EliLlmStream * stream, int32_t n_keep)
 
 void eliza_inference_llm_stream_close(EliLlmStream * stream) {
     if (!stream) return;
+    if (stream->backend) {
+        delete stream->backend;
+        stream->backend = nullptr;
+    }
     if (stream->mtp) {
         eliza_mtp::free_engine(stream->mtp);
         stream->mtp = nullptr;
diff --git a/tools/omnivoice/src/llm-backend-selector.cpp b/tools/omnivoice/src/llm-backend-selector.cpp
new file mode 100644
index 000000000..fa5fa703c
--- /dev/null
+++ b/tools/omnivoice/src/llm-backend-selector.cpp
@@ -0,0 +1,140 @@
+/*
+ * llm-backend-selector.cpp — registry + selection for the multi-runtime
+ * streaming-LLM seam (cutover plan M3).
+ *
+ * On a default build (no -DELIZA_ENABLE_* gate) NO alternate backend is
+ * registered, so llm_backend_select() always returns nullptr and the FFI keeps
+ * the in-tree llama.cpp path. The seam is therefore inert-by-default: the
+ * library behaves exactly as before until an accelerator backend is compiled in.
+ */
+
+#include "llm-backend.h"
+
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <mutex>
+#include <string>
+#include <vector>
+
+/* Gated backend factory accessors. Declared only when the matching backend is
+ * compiled in; register_builtins() calls them under the same gate. Keeping the
+ * declarations gated means the default build has no unresolved symbols. */
+#ifdef ELIZA_ENABLE_LITERT
+LlmBackendFactory * litert_backend_factory();
+#endif
+#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
+LlmBackendFactory * mlx_coreml_backend_factory();
+#endif
+
+namespace {
+
+std::mutex                       g_reg_mutex;
+std::vector<LlmBackendFactory *> g_factories;
+std::once_flag                   g_builtins_once;
+
+/* malloc()-backed copy of `msg` (nullptr if malloc fails), so the caller can free
+ * it with eliza_inference_free_string() (which calls free()), per the FFI contract. */
+char * dup_error(const std::string & msg) {
+    auto * copy = static_cast<char *>(std::malloc(msg.size() + 1));
+    if (copy != nullptr) std::memcpy(copy, msg.c_str(), msg.size() + 1);
+    return copy;
+}
+
+/* Case-insensitive (std::tolower) equality of two C strings; a null argument
+ * never matches anything, including another null. */
+bool iequals(const char * a, const char * b) {
+    if (a == nullptr || b == nullptr) return false;
+    const auto fold = [](char c) { return std::tolower((unsigned char) c); };
+    for (; *a != '\0' && *b != '\0'; ++a, ++b) {
+        if (fold(*a) != fold(*b)) return false;
+    }
+    /* At least one string has ended; they match only if both did. */
+    return *a == *b;
+}
+
+bool is_llamacpp_name(const char * s) { /* aliases that select the in-tree path */
+    return iequals(s, "llama") || iequals(s, "llamacpp") || iequals(s, "llama.cpp");
+}
+
+} // namespace
+
+void llm_backend_register(LlmBackendFactory * factory) {
+    if (factory == nullptr) return; /* nothing to add */
+    const std::lock_guard<std::mutex> guard(g_reg_mutex);
+    for (LlmBackendFactory * known : g_factories) {
+        if (iequals(known->name(), factory->name())) return; /* idempotent by name */
+    }
+    g_factories.push_back(factory);
+}
+
+void llm_backend_register_builtins() {
+    std::call_once(g_builtins_once, []() {  // the lambda body runs at most once per process
+#ifdef ELIZA_ENABLE_LITERT
+        llm_backend_register(litert_backend_factory());
+#endif
+#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__)
+        llm_backend_register(mlx_coreml_backend_factory());
+#endif
+    });  // with neither gate defined the lambda is empty and nothing is registered
+}
+
+/* Choose the backend factory for `bundle_dir`; see llm-backend.h for the full
+ * contract. nullptr with *out_error untouched means "use in-tree llama.cpp";
+ * nullptr with *out_error set is a hard failure of the env override. */
+LlmBackendFactory * llm_backend_select(const char * bundle_dir,
+                                       const eliza_llm_stream_config_t * /*cfg*/,
+                                       char ** out_error) {
+    llm_backend_register_builtins();
+
+    /* (1) ELIZA_LLM_BACKEND env: a HARD select. Empty counts as unset. */
+    const char * forced       = std::getenv("ELIZA_LLM_BACKEND");
+    const bool   has_override = (forced != nullptr) && (forced[0] != '\0');
+    if (has_override && is_llamacpp_name(forced)) {
+        return nullptr; /* force the in-tree path, not an error */
+    }
+
+    /* Both remaining paths walk the registry, so take the lock once here. */
+    std::lock_guard<std::mutex> lock(g_reg_mutex);
+
+    if (has_override) {
+        /* Every hard-select failure shares one message prefix; `why` + `detail`
+         * form the tail. The message is only built when the caller wants it. */
+        const auto reject = [&](const char * why, const char * detail) -> LlmBackendFactory * {
+            if (out_error) {
+                *out_error = dup_error(
+                    std::string("[libelizainference] ELIZA_LLM_BACKEND=") + forced +
+                    why + detail);
+            }
+            return nullptr;
+        };
+        for (LlmBackendFactory * f : g_factories) {
+            if (!iequals(f->name(), forced)) continue;
+            if (!f->available()) {
+                return reject(" is not available in this build/host", "");
+            }
+            if (!f->can_serve(bundle_dir)) {
+                return reject(" cannot serve the bundle at ",
+                              bundle_dir ? bundle_dir : "(null)");
+            }
+            return f;
+        }
+        return reject(" is not a registered backend", "");
+    }
+
+    /* (2) Auto-select: the highest preference_rank among available + can_serve.
+     * The in-tree llama.cpp path is the implicit rank-0 fallback, so an
+     * accelerator backend only wins when it returns a positive rank AND can
+     * serve this bundle. Ties keep the earlier-registered factory. */
+    LlmBackendFactory * winner   = nullptr;
+    int                 top_rank = 0;
+    for (LlmBackendFactory * f : g_factories) {
+        if (!(f->available() && f->can_serve(bundle_dir))) continue;
+        const int rank = f->preference_rank();
+        if (rank > top_rank) {
+            top_rank = rank;
+            winner   = f;
+        }
+    }
+    return winner; /* nullptr => in-tree llama.cpp */
+}
diff --git a/tools/omnivoice/src/llm-backend.h b/tools/omnivoice/src/llm-backend.h
new file mode 100644
index 000000000..0fad67f3c
--- /dev/null
+++ b/tools/omnivoice/src/llm-backend.h
@@ -0,0 +1,167 @@
+#pragma once
+/*
+ * llm-backend.h — multi-runtime streaming-LLM backend seam (cutover plan M3).
+ *
+ * The libelizainference streaming-LLM FFI (`eliza_inference_llm_stream_*`) is
+ * ONE pipe that can be driven by more than one in-process inference runtime:
+ *
+ *   - llama.cpp   — the default / reference backend (CPU / CUDA / Vulkan-Mali-
+ *                   Adreno / Metal). Always present; the in-tree code path.
+ *   - LiteRT-LM   — Android NPU (Tensor / Qualcomm QNN / MediaTek NeuroPilot),
+ *                   optionally desktop/iOS GPU. Gated -DELIZA_ENABLE_LITERT.
+ *   - CoreML/MLX  — Apple Silicon (mac first, iOS later). Gated -DELIZA_ENABLE_MLX.
+ *
+ * Per native/AGENTS.md §11 (reinterpreted by the Gemma-4 cutover): "one managed
+ * library, one pipe, no sidecar/subprocess/TCP." LiteRT-LM and MLX are
+ * EMBEDDABLE in-process C++ libraries linked INTO libelizainference and exposed
+ * behind the SAME FFI streaming symbols — never a child process or TCP server.
+ * (AICore / Apple Foundation stay opportunistic out-of-process adapters on the
+ * TS side, not owned backends — they are NOT registered here.)
+ *
+ * A backend supplies:
+ *   - LlmBackendSession  — the per-generation streaming session, mirroring the
+ *                          FFI pull contract (prefill -> next* -> close) 1:1 so
+ *                          the FFI functions delegate without translation.
+ *   - LlmBackendFactory  — names the runtime, reports availability + bundle fit,
+ *                          and opens sessions.
+ *
+ * `llm_backend_select()` picks a backend at `_open` time from the platform, the
+ * bundle contents, the build flags, and the `ELIZA_LLM_BACKEND` env override.
+ * When it returns nullptr (and no error) the FFI keeps the in-tree llama.cpp
+ * path — so a build with no alternate backend behaves exactly as before.
+ */
+
+#include "eliza-inference-ffi.h" /* eliza_llm_stream_config_t, EliInferenceContext fwd */
+
+#include <cstddef>
+#include <cstdint>
+
+/* Defined in the FFI translation unit. Opaque to backends — a backend reaches
+ * the resident model/bundle through the accessors below, not the struct. */
+struct EliInferenceContext;
+
+/* The bundle directory the context was opened against. A backend's open()
+ * resolves its own artifact under this root (e.g. `<dir>/text/*.litertlm`,
+ * `<dir>/text/*.mlpackage`) — the ONLY supported way to read the bundle path,
+ * since the struct itself is opaque here. Returns nullptr when ctx is null.
+ * Defined in eliza-inference-ffi.cpp; the pointer is owned by the context and
+ * stays valid for the session's lifetime. */
+const char * llm_backend_context_bundle_dir(const EliInferenceContext * ctx);
+
+/* ---- Per-generation streaming session ------------------------------------ *
+ *
+ * Lifetime: created by LlmBackendFactory::open(), destroyed via `delete` on the
+ * FFI `_close` path. Every method mirrors the matching FFI entry point so the
+ * FFI can `return session->method(...)` with no argument translation. Status
+ * conventions are identical to the FFI: >= 0 on success, the negative `ELIZA_*`
+ * constants on failure, with `*out_error` heap-allocated for the caller to free.
+ */
+struct LlmBackendSession {
+    virtual ~LlmBackendSession() = default;
+
+    /* Mirrors eliza_inference_llm_stream_prefill. Copies the tokens it needs. */
+    virtual int prefill(const int32_t * token_ids, size_t num_tokens,
+                        char ** out_error) = 0;
+
+    /* Mirrors eliza_inference_llm_stream_next. Returns 0 (more output), 1 (final
+     * step — EOS / cap), or a negative ELIZA_* code (ELIZA_ERR_CANCELLED on
+     * cancel). `drafter_*_out` carry per-step speculative stats (0 when the
+     * backend has no drafter). */
+    virtual int next(int32_t * tokens_out, size_t tokens_cap,
+                     size_t * num_tokens_out, char * text_out, size_t text_cap,
+                     int32_t * drafter_drafted_out, int32_t * drafter_accepted_out,
+                     char ** out_error) = 0;
+
+    /* Mirrors eliza_inference_llm_stream_cancel. Publishes a flag an in-flight
+     * next() checks at a step boundary; safe to call from another thread.
+     * Returns ELIZA_OK whether or not a pass was running. */
+    virtual int cancel() = 0;
+
+    /* Mirrors eliza_inference_llm_stream_reset: clear KV + sampler/counters so
+     * the next prefill starts a fresh prompt on the same warm session. */
+    virtual int reset() = 0;
+
+    /* Mirrors eliza_inference_llm_stream_reset_keep: keep the first `n_keep`
+     * tokens of state resident and drop the rest. Returns the n_keep actually
+     * applied (>= 0, may be clamped / 0 on a full-reset fallback). A backend
+     * that cannot do prefix reuse MUST fall back to a full reset and return 0 —
+     * never an error. */
+    virtual int reset_keep(int32_t n_keep) = 0;
+
+    /* Slot KV persistence — optional. Defaults: ELIZA_ERR_INVALID_ARG, with *out_error left unset. */
+    virtual int save_slot(const char * /*filename*/, char ** /*out_error*/) {
+        return ELIZA_ERR_INVALID_ARG;  // NOTE(review): no message, unlike the failure convention stated above this struct
+    }
+    virtual int restore_slot(const char * /*filename*/, char ** /*out_error*/) {
+        return ELIZA_ERR_INVALID_ARG;  // same: callers get a code but no *out_error text
+    }
+};
+
+/* ---- Backend factory (one per linked-in runtime) ------------------------- */
+struct LlmBackendFactory {
+    virtual ~LlmBackendFactory() = default;
+
+    /* Stable lower-case id: "llama.cpp", "litert-lm", "mlx-coreml". Matched
+     * case-insensitively against ELIZA_LLM_BACKEND. */
+    virtual const char * name() const = 0;
+
+    /* True only when this backend is compiled in AND its runtime dependencies
+     * are present on THIS host (the NPU delegate / Metal device / the linked
+     * lib). A scaffold whose build gate is OFF returns false. Cheap — must not
+     * load a model. */
+    virtual bool available() const = 0;
+
+    /* True when this backend can serve the bundle at `bundle_dir` — i.e. the
+     * backend-specific artifact exists (e.g. `text/*.litertlm`, `text/*.mlpackage`).
+     * Cheap directory probe, no model load. */
+    virtual bool can_serve(const char * bundle_dir) const = 0;
+
+    /* Platform-affinity rank used to order candidates when several can serve the
+     * same bundle and no env override is set. Higher wins. The in-tree llama.cpp
+     * path is the implicit rank-0 fallback, so auto-select only ever picks a
+     * factory whose rank is > 0; a rank-0 factory is reachable via env only. */
+    virtual int preference_rank() const { return 0; }
+
+    /* Create a streaming session for (ctx, cfg). Returns nullptr + `*out_error`
+     * on failure. The returned session is owned by the caller (FFI `_close`
+     * deletes it). */
+    virtual LlmBackendSession * open(EliInferenceContext * ctx,
+                                     const eliza_llm_stream_config_t * cfg,
+                                     char ** out_error) = 0;
+};
+
+/* ---- Registry + selection ------------------------------------------------ *
+ *
+ * Backends register their singleton factory (idempotent; the registry does not
+ * take ownership — factories are static-lifetime singletons). The FFI calls
+ * llm_backend_select() per `_open`; that call first runs
+ * llm_backend_register_builtins(), which registers compiled-in backends once.
+ */
+
+/* Register a factory (idempotent by name). Safe to call from static init. */
+void llm_backend_register(LlmBackendFactory * factory);
+
+/* Register every backend compiled into THIS build (gated by the -DELIZA_ENABLE_*
+ * CMake options). Idempotent (std::call_once); llm_backend_select() invokes it.
+ * Defined in llm-backend-selector.cpp, which calls each gated factory accessor. */
+void llm_backend_register_builtins();
+
+/* Pick a backend for the bundle at `bundle_dir` with `cfg`. Resolution order:
+ *
+ *   1. ELIZA_LLM_BACKEND env (exact, case-insensitive name; empty = unset) — a
+ *      HARD select. "llama.cpp" / "llamacpp" / "llama" forces the in-tree path
+ *      (returns nullptr, no error). Any other name that is not registered +
+ *      available, or cannot serve the bundle, is a hard error: returns nullptr AND
+ *      sets `*out_error` so the FFI aborts rather than silently using llama.cpp.
+ *
+ *   2. No env override: among registered backends that are available() AND
+ *      can_serve(bundle_dir), pick the highest preference_rank(). If none
+ *      qualifies, return nullptr (use the in-tree llama.cpp path).
+ *
+ * A nullptr return with `*out_error == nullptr` means "use the in-tree llama.cpp
+ * path" — NOT an error. A nullptr return with `*out_error != nullptr` is a hard
+ * failure the caller must propagate.
+ */
+LlmBackendFactory * llm_backend_select(const char * bundle_dir,
+                                       const eliza_llm_stream_config_t * cfg,
+                                       char ** out_error);