elizaOS · lalalune · Jun 2, 2026 · Jun 22, 2026
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
@@ -1990,7 +1990,19 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla
         uint32_t cell_range_begin = cells.size();
 
         for (uint32_t i = 0; i < cells.size(); ++i) {
-            if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) {
+            bool add_cell = true;
+
+            add_cell = add_cell && !cells.is_empty(i);
+            add_cell = add_cell && (seq_id == -1 || cells.seq_has(i, seq_id));
+
+            // single-sequence save: skip cells that are SWA-masked relative to that sequence's max position
+            if (add_cell && seq_id != -1) {
+                const bool is_masked = llama_hparams::is_masked_swa(n_swa, swa_type, cells.pos_get(i), cells.seq_pos_max(seq_id));
+
+                add_cell = !is_masked;
+            }
+
+            if (add_cell) {
                 ++cell_count;
                 if (cell_range_begin == cells.size()) {
                     cell_range_begin = i;
@@ -2246,7 +2258,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32
 
         sinfo = find_slot(ubatch, false);
         if (sinfo.empty()) {
-            LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
+            LLAMA_LOG_ERROR("%s: failed to find %u available cells in kv cache\n", __func__, cell_count);
             return false;
         }
 

diff --git a/tools/omnivoice/CMakeLists.txt b/tools/omnivoice/CMakeLists.txt
@@ -78,6 +78,11 @@ set(OMNIVOICE_CORE_SOURCES
 # llama + mtmd into a single ABI-stable C surface.
 set(OMNIVOICE_FFI_SOURCES
     src/eliza-inference-ffi.cpp
+    # Multi-runtime streaming-LLM backend seam (cutover plan M3). The selector
+    # is always compiled; it is inert until a -DELIZA_ENABLE_* accelerator
+    # backend below registers itself, so the default build keeps the in-tree
+    # llama.cpp path.
+    src/llm-backend-selector.cpp
 )
 
 # Vendored standalone voice-classifier forward graphs (pure scalar C, no
@@ -220,6 +225,19 @@ endif()
 # (the fused build links mtmd anyway), opt out with -DELIZA_ENABLE_VISION=OFF.
 option(ELIZA_ENABLE_VISION "Build the fused mmproj vision-describe ABI (v9)" ON)
 
+# ELIZA_ENABLE_LITERT — compile the LiteRT-LM in-process streaming-LLM backend
+# (cutover plan M4 — Android NPU: Tensor / Qualcomm QNN / MediaTek NeuroPilot).
+# OFF by default: the selector registers no LiteRT backend and the streaming-LLM
+# pipe keeps the in-tree llama.cpp path. ON requires the LiteRT-LM SDK, passed
+# as -DELIZA_LITERT_SDK_DIR=<dir> (plus -DELIZA_LITERT_LIBS=<libs>); this is for
+# device cross-builds, not the Linux CI default. See docs/multi-backend-ffi-seam.md.
+option(ELIZA_ENABLE_LITERT "Build the LiteRT-LM in-process LLM backend (M4)" OFF)
+
+# ELIZA_ENABLE_MLX — compile the CoreML/MLX in-process streaming-LLM backend
+# (cutover plan M5 — Apple Silicon). OFF by default; ON is Apple-only and
+# requires the MLX / CoreML toolchain. See docs/multi-backend-ffi-seam.md.
+option(ELIZA_ENABLE_MLX "Build the CoreML/MLX in-process LLM backend (M5)" OFF)
+
 if(TARGET mtmd)
     add_library(elizainference SHARED
         ${OMNIVOICE_CORE_SOURCES}
@@ -271,6 +289,48 @@ if(TARGET mtmd)
             ${CMAKE_CURRENT_SOURCE_DIR}/../kokoro/include)
         target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_KOKORO)
     endif()
+    # ── Multi-runtime streaming-LLM accelerator backends (cutover M4/M5) ────
+    # The M3 selector (src/llm-backend-selector.cpp) is always compiled in via
+    # OMNIVOICE_FFI_SOURCES. These accelerator backends each link an external
+    # SDK, so they are opt-in. When a gate is OFF its source is not compiled,
+    # the selector's `#ifdef`-guarded factory declaration + registration drop
+    # out, and the streaming-LLM pipe keeps the in-tree llama.cpp path — so the
+    # default desktop/CI build behaves exactly as it did before the seam was added.
+    if(ELIZA_ENABLE_LITERT)
+        target_sources(elizainference PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp)
+        target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT)
+        # LiteRT-LM SDK (github.com/google-ai-edge/LiteRT-LM). Point at a built
+        # SDK with -DELIZA_LITERT_SDK_DIR=<dir>; the device/host cross-build
+        # links its libs + the NPU delegates with -DELIZA_LITERT_LIBS=<libs>.
+        if(ELIZA_LITERT_SDK_DIR)
+            target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/include)
+            target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/lib)
+        endif()
+        if(ELIZA_LITERT_LIBS)
+            target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LIBS})
+        endif()
+    endif()
+    if(ELIZA_ENABLE_MLX)
+        if(NOT APPLE)
+            message(FATAL_ERROR
+                "ELIZA_ENABLE_MLX requires an Apple host (CoreML/MLX are Apple-only).")
+        endif()
+        target_sources(elizainference PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/mlx-coreml-backend.mm)
+        target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_MLX)
+        # MLX C API (ml-explore/mlx-c) via -DELIZA_MLX_C_DIR / -DELIZA_MLX_LIBS,
+        # plus the system CoreML / Metal / Foundation frameworks.
+        if(ELIZA_MLX_C_DIR)
+            target_include_directories(elizainference PRIVATE ${ELIZA_MLX_C_DIR}/include)
+            target_link_directories(elizainference PRIVATE ${ELIZA_MLX_C_DIR}/lib)
+        endif()
+        if(ELIZA_MLX_LIBS)
+            target_link_libraries(elizainference PRIVATE ${ELIZA_MLX_LIBS})
+        endif()
+        target_link_libraries(elizainference PRIVATE
+            "-framework Foundation" "-framework CoreML" "-framework Metal")
+    endif()
     set_target_properties(elizainference PROPERTIES
         OUTPUT_NAME              elizainference
         POSITION_INDEPENDENT_CODE ON)