diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index cd9666a21..0eb9dad8a 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1990,7 +1990,19 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla uint32_t cell_range_begin = cells.size(); for (uint32_t i = 0; i < cells.size(); ++i) { - if (!cells.is_empty(i) && (seq_id == -1 || cells.seq_has(i, seq_id))) { + bool add_cell = true; + + add_cell = add_cell && !cells.is_empty(i); + add_cell = add_cell && (seq_id == -1 || cells.seq_has(i, seq_id)); + + // check the cell is not SWA-masked + if (add_cell && seq_id != -1) { + const bool is_masked = llama_hparams::is_masked_swa(n_swa, swa_type, cells.pos_get(i), cells.seq_pos_max(seq_id)); + + add_cell = !is_masked; + } + + if (add_cell) { ++cell_count; if (cell_range_begin == cells.size()) { cell_range_begin = i; @@ -2246,7 +2258,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t strm, uint32 sinfo = find_slot(ubatch, false); if (sinfo.empty()) { - LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + LLAMA_LOG_ERROR("%s: failed to find %d available cells in kv cache\n", __func__, cell_count); return false; } diff --git a/tools/omnivoice/CMakeLists.txt b/tools/omnivoice/CMakeLists.txt index 72088dfa0..6cb3e13a6 100644 --- a/tools/omnivoice/CMakeLists.txt +++ b/tools/omnivoice/CMakeLists.txt @@ -78,6 +78,11 @@ set(OMNIVOICE_CORE_SOURCES # llama + mtmd into a single ABI-stable C surface. set(OMNIVOICE_FFI_SOURCES src/eliza-inference-ffi.cpp + # Multi-runtime streaming-LLM backend seam (cutover plan M3). The selector + # is always compiled; it is inert until a -DELIZA_ENABLE_* accelerator + # backend below registers itself, so the default build keeps the in-tree + # llama.cpp path. + src/llm-backend-selector.cpp ) # Vendored standalone voice-classifier forward graphs (pure scalar C, no @@ -220,6 +225,19 @@ endif() # (the fused build links mtmd anyway), opt out with -DELIZA_ENABLE_VISION=OFF. option(ELIZA_ENABLE_VISION "Build the fused mmproj vision-describe ABI (v9)" ON) +# ELIZA_ENABLE_LITERT — compile the LiteRT-LM in-process streaming-LLM backend +# (cutover plan M4 — Android NPU: Tensor / Qualcomm QNN / MediaTek NeuroPilot). +# OFF by default: the selector registers no LiteRT backend and the streaming-LLM +# pipe keeps the in-tree llama.cpp path. ON requires the LiteRT-LM SDK +# (ELIZA_LITERT_SDK_DIR) — a host/device cross-build concern, not the Linux CI +# default. See docs/multi-backend-ffi-seam.md. +option(ELIZA_ENABLE_LITERT "Build the LiteRT-LM in-process LLM backend (M4)" OFF) + +# ELIZA_ENABLE_MLX — compile the CoreML/MLX in-process streaming-LLM backend +# (cutover plan M5 — Apple Silicon). OFF by default; ON is Apple-only and +# requires the MLX / CoreML toolchain. See docs/multi-backend-ffi-seam.md. +option(ELIZA_ENABLE_MLX "Build the CoreML/MLX in-process LLM backend (M5)" OFF) + if(TARGET mtmd) add_library(elizainference SHARED ${OMNIVOICE_CORE_SOURCES} @@ -271,6 +289,48 @@ if(TARGET mtmd) ${CMAKE_CURRENT_SOURCE_DIR}/../kokoro/include) target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_KOKORO) endif() + # ── Multi-runtime streaming-LLM accelerator backends (cutover M4/M5) ──── + # The M3 selector (src/llm-backend-selector.cpp) is always compiled in via + # OMNIVOICE_FFI_SOURCES. These accelerator backends each link an external + # SDK, so they are opt-in. When a gate is OFF its source is not compiled, + # the selector's `#ifdef`-guarded factory declaration + registration drop + # out, and the streaming-LLM pipe keeps the in-tree llama.cpp path — so the + # default desktop/CI build is byte-for-byte the pre-seam behavior. + if(ELIZA_ENABLE_LITERT) + target_sources(elizainference PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/litert-backend.cpp) + target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_LITERT) + # LiteRT-LM SDK (github.com/google-ai-edge/LiteRT-LM). Point at a built + # SDK with -DELIZA_LITERT_SDK_DIR=; the device/host cross-build + # links its libs + the NPU delegates with -DELIZA_LITERT_LIBS=. + if(ELIZA_LITERT_SDK_DIR) + target_include_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/include) + target_link_directories(elizainference PRIVATE ${ELIZA_LITERT_SDK_DIR}/lib) + endif() + if(ELIZA_LITERT_LIBS) + target_link_libraries(elizainference PRIVATE ${ELIZA_LITERT_LIBS}) + endif() + endif() + if(ELIZA_ENABLE_MLX) + if(NOT APPLE) + message(FATAL_ERROR + "ELIZA_ENABLE_MLX requires an Apple host (CoreML/MLX are Apple-only).") + endif() + target_sources(elizainference PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/backends/mlx-coreml-backend.mm) + target_compile_definitions(elizainference PRIVATE ELIZA_ENABLE_MLX) + # MLX C API (ml-explore/mlx-c) via -DELIZA_MLX_C_DIR / -DELIZA_MLX_LIBS, + # plus the system CoreML / Metal / Foundation frameworks. + if(ELIZA_MLX_C_DIR) + target_include_directories(elizainference PRIVATE ${ELIZA_MLX_C_DIR}/include) + target_link_directories(elizainference PRIVATE ${ELIZA_MLX_C_DIR}/lib) + endif() + if(ELIZA_MLX_LIBS) + target_link_libraries(elizainference PRIVATE ${ELIZA_MLX_LIBS}) + endif() + target_link_libraries(elizainference PRIVATE + "-framework Foundation" "-framework CoreML" "-framework Metal") + endif() set_target_properties(elizainference PROPERTIES OUTPUT_NAME elizainference POSITION_INDEPENDENT_CODE ON) diff --git a/tools/omnivoice/src/backends/litert-backend.cpp b/tools/omnivoice/src/backends/litert-backend.cpp new file mode 100644 index 000000000..3b3dad137 --- /dev/null +++ b/tools/omnivoice/src/backends/litert-backend.cpp @@ -0,0 +1,471 @@ +/* + * litert-backend.cpp — LiteRT-LM in-process streaming-LLM backend (M4). + * + * See litert-backend.h for the targeted LiteRT-LM C++ API (repo + commit + * date cited there). The real implementation is gated behind + * `ELIZA_ENABLE_LITERT`; the default (Linux/desktop) build compiles the stub + * branch, which links zero LiteRT-LM SDK headers and reports + * `available() == false` so the selector keeps the in-tree llama.cpp path. + * + * Error contract (native/AGENTS.md §3 + §9): never log, never return a + * defaulted result on failure. Every failure path heap-allocates `*out_error` + * via litert_set_error() (matching the FFI cpp's eliza_strdup/eliza_set_error + * style) and returns the negative ELIZA_* code or nullptr. + */ + +#include "litert-backend.h" + +#include +#include +#include +#include +#include + +#if defined(__has_include) +# if __has_include() +# include +# define LITERT_HAVE_FILESYSTEM 1 +# endif +#endif + +/* ── Heap-allocated error strings (mirror eliza-inference-ffi.cpp) ───────── */ +namespace { + +char * litert_strdup(const std::string & s) { + char * out = static_cast(std::malloc(s.size() + 1)); + if (!out) return nullptr; + std::memcpy(out, s.c_str(), s.size() + 1); + return out; +} + +void litert_set_error(char ** out_error, const std::string & msg) { + if (!out_error) return; + *out_error = litert_strdup(msg); +} + +#if defined(LITERT_HAVE_FILESYSTEM) +/* Probe /text/ for a *.litertlm artifact. Cheap directory walk, + * no model load (LlmBackendFactory::can_serve contract). */ +std::string find_litertlm_artifact(const char * bundle_dir) { + if (!bundle_dir || bundle_dir[0] == '\0') return std::string(); + std::error_code ec; + std::filesystem::path text_dir = + std::filesystem::path(bundle_dir) / LITERT_BUNDLE_TEXT_SUBDIR; + if (!std::filesystem::is_directory(text_dir, ec)) return std::string(); + for (std::filesystem::directory_iterator it(text_dir, ec), end; + !ec && it != end; it.increment(ec)) { + if (!it->is_regular_file(ec)) continue; + if (it->path().extension() == LITERT_ARTIFACT_EXT) { + return it->path().string(); + } + } + return std::string(); +} +#else +std::string find_litertlm_artifact(const char *) { return std::string(); } +#endif + +} // namespace + +/* ════════════════════════════════════════════════════════════════════════ * + * REAL implementation — only when ELIZA_ENABLE_LITERT is defined. + * Behind this gate we may include LiteRT-LM SDK headers; outside it we + * include NONE so the file builds on a host without the SDK. + * ════════════════════════════════════════════════════════════════════════ */ +#ifdef ELIZA_ENABLE_LITERT + +#include +#include +#include +#include +#include + +/* LiteRT-LM cross-platform C++ runtime. Paths per the repo's bazel layout + * (github.com/google-ai-edge/LiteRT-LM, `main`, researched 2026-06-22). */ +#include "runtime/engine/engine.h" // litert::lm::Engine, SessionInterface +#include "runtime/engine/engine_settings.h" // EngineSettings, SessionConfig, ModelAssets +#include "runtime/engine/io_types.h" // InputData, InputText, Responses + +namespace { + +using litert::lm::Backend; +using litert::lm::Engine; +using litert::lm::EngineSettings; +using litert::lm::InputData; +using litert::lm::InputText; +using litert::lm::ModelAssets; +using litert::lm::Responses; +using litert::lm::SessionConfig; + +/* The Session type the templated Engine hands back (Engine::Session is the + * public alias EngineT exposes; for Engine it is SessionInterface). */ +using Session = Engine::Session; + +/* The accelerator the factory resolved at open(), recorded for diagnostics + * and preference reporting. DEVICE-VERIFY: which rung actually initializes is + * hardware-dependent and can only be confirmed on an NPU/GPU device. */ +enum class ResolvedAccelerator { kNone, kNpu, kGpu, kCpu }; + +const char * accelerator_name(ResolvedAccelerator a) { + switch (a) { + case ResolvedAccelerator::kNpu: return "npu"; + case ResolvedAccelerator::kGpu: return "gpu"; + case ResolvedAccelerator::kCpu: return "cpu"; + default: return "none"; + } +} + +/* Try to build an Engine for `artifact` on `backend`. Returns the Engine on + * success; on failure returns nullptr (the ladder falls through to the next + * rung). The error text is captured so the final rung can surface it. */ +std::unique_ptr try_engine(const std::string & artifact, + Backend backend, + std::string & last_err) { + auto model_assets = ModelAssets::Create(artifact); + if (!model_assets.ok()) { + last_err = std::string(model_assets.status().message()); + return nullptr; + } + auto settings = EngineSettings::CreateDefault(*model_assets, backend); + if (!settings.ok()) { + last_err = std::string(settings.status().message()); + return nullptr; + } + auto engine = Engine::CreateEngine(*settings); + if (!engine.ok()) { + last_err = std::string(engine.status().message()); + return nullptr; + } + return std::move(*engine); +} + +/* ── Session: mirrors the FFI streaming pull contract 1:1 ────────────────── */ +class LiteRtBackendSession final : public LlmBackendSession { +public: + LiteRtBackendSession(std::unique_ptr engine, + std::unique_ptr session, + const eliza_llm_stream_config_t & cfg, + ResolvedAccelerator accel) + : engine_(std::move(engine)), + session_(std::move(session)), + accel_(accel), + max_tokens_(cfg.max_tokens > 0 ? cfg.max_tokens : 0) {} + + /* prefill: copy the caller's tokens, detokenize through the engine's + * tokenizer, and run a LiteRT prefill pass. The FFI hands pre-tokenized + * ids (text-model vocab); LiteRT-LM's prefill consumes InputData (text), + * so we round-trip ids → text via the shared tokenizer rather than + * assuming vocab parity (the .litertlm graph carries its own tokenizer). + * DEVICE-VERIFY: id/text round-trip fidelity needs a real .litertlm. */ + int prefill(const int32_t * token_ids, size_t num_tokens, + char ** out_error) override { + if (!session_) { + litert_set_error(out_error, + "[litert-lm] prefill: session is not open"); + return ELIZA_ERR_INVALID_ARG; + } + if (cancelled_.load(std::memory_order_acquire)) { + return ELIZA_ERR_CANCELLED; + } + std::vector ids; + ids.reserve(num_tokens); + for (size_t i = 0; i < num_tokens; ++i) ids.push_back(token_ids[i]); + + const std::string text = engine_->GetTokenizer().Detokenize(ids); + std::vector contents; + contents.emplace_back(InputText(std::string(text))); + + absl::Status st = session_->RunPrefill(contents); + if (!st.ok()) { + litert_set_error(out_error, + std::string("[litert-lm] RunPrefill failed: ") + + std::string(st.message())); + return ELIZA_ERR_FFI_FAULT; + } + prefilled_ = true; + return ELIZA_OK; + } + + /* next: one decode step. LiteRT-LM's RunDecode() returns a Responses + * batch; we emit the newly-produced UTF-8 delta as detokenized text and + * its token ids. LiteRT-LM has no in-process MTP drafter exposed through + * this surface, so drafted/accepted are always 0. Returns 1 (final) at + * EOS or the max-token cap, 0 otherwise. */ + int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out, + char * text_out, size_t text_cap, int32_t * drafter_drafted_out, + int32_t * drafter_accepted_out, char ** out_error) override { + if (num_tokens_out) *num_tokens_out = 0; + if (text_out && text_cap) text_out[0] = '\0'; + if (drafter_drafted_out) *drafter_drafted_out = 0; + if (drafter_accepted_out) *drafter_accepted_out = 0; + + if (!session_) { + litert_set_error(out_error, "[litert-lm] next: session not open"); + return ELIZA_ERR_INVALID_ARG; + } + if (!prefilled_) { + litert_set_error(out_error, + "[litert-lm] next: prefill must run before next"); + return ELIZA_ERR_INVALID_ARG; + } + if (cancelled_.load(std::memory_order_acquire)) { + return ELIZA_ERR_CANCELLED; + } + + auto responses = session_->RunDecode(); + if (!responses.ok()) { + litert_set_error(out_error, + std::string("[litert-lm] RunDecode failed: ") + + std::string(responses.status().message())); + return ELIZA_ERR_FFI_FAULT; + } + + /* RunDecode yields the running candidate texts; GetTexts()[0] is the + * cumulative decode for candidate 0. Emit only the suffix produced + * since the last step so the FFI streams a delta per pull. */ + const std::vector & texts = responses->GetTexts(); + std::string cumulative = texts.empty() ? std::string() : texts.front(); + std::string delta = compute_delta(cumulative); + emitted_chars_ = cumulative.size(); + + /* Re-tokenize the delta against the engine tokenizer so the FFI gets + * committed text-vocab ids (the same round-trip the prefill used). */ + std::vector delta_ids = engine_->GetTokenizer().Tokenize(delta); + size_t n_emit = delta_ids.size(); + if (n_emit > tokens_cap) n_emit = tokens_cap; + if (tokens_out) { + for (size_t i = 0; i < n_emit; ++i) { + tokens_out[i] = static_cast(delta_ids[i]); + } + } + if (num_tokens_out) *num_tokens_out = n_emit; + if (text_out && text_cap) { + const size_t copy = delta.size() < text_cap - 1 + ? delta.size() + : text_cap - 1; + std::memcpy(text_out, delta.data(), copy); + text_out[copy] = '\0'; + } + + decoded_tokens_ += static_cast(delta_ids.size()); + const bool hit_cap = + max_tokens_ > 0 && decoded_tokens_ >= max_tokens_; + /* DEVICE-VERIFY: the precise EOS signal LiteRT-LM exposes per step is + * runtime-version-dependent. A done decode yields no new delta; treat + * an empty delta or the token cap as the final step. */ + const bool eos = delta_ids.empty(); + return (hit_cap || eos) ? 1 : 0; + } + + /* cancel: publish a flag the next decode step observes. Thread-safe. */ + int cancel() override { + cancelled_.store(true, std::memory_order_release); + return ELIZA_OK; + } + + /* reset: drop a fresh Session from the same Engine (clears KV + sampler). + * Reuses the warm Engine (model weights stay resident) — only the + * per-generation Session is rebuilt. */ + int reset() override { + auto cfg = SessionConfig::CreateDefault(); + auto session = engine_->CreateSession(cfg); + if (!session.ok()) { + /* reset has no out_error param; a failed rebuild leaves the old + * session in place and surfaces on the next prefill/next. */ + return ELIZA_ERR_FFI_FAULT; + } + session_ = std::move(*session); + cancelled_.store(false, std::memory_order_release); + prefilled_ = false; + decoded_tokens_ = 0; + emitted_chars_ = 0; + return ELIZA_OK; + } + + /* reset_keep: LiteRT-LM's Session does not expose prefix-preserving KV + * trimming through this surface, so fall back to a full reset and return 0 + * (no prefix kept) — never an error (llm-backend.h contract). */ + int reset_keep(int32_t /*n_keep*/) override { + reset(); + return 0; + } + + const char * accelerator() const { return accelerator_name(accel_); } + +private: + /* The suffix of `cumulative` produced since the last emitted step. */ + std::string compute_delta(const std::string & cumulative) const { + if (cumulative.size() <= emitted_chars_) return std::string(); + return cumulative.substr(emitted_chars_); + } + + std::unique_ptr engine_; + std::unique_ptr session_; + std::atomic cancelled_{false}; + bool prefilled_ = false; + int32_t decoded_tokens_ = 0; + size_t emitted_chars_ = 0; + ResolvedAccelerator accel_ = ResolvedAccelerator::kNone; + int32_t max_tokens_ = 0; +}; + +/* ── Factory ─────────────────────────────────────────────────────────────── */ +class LiteRtBackendFactory final : public LlmBackendFactory { +public: + const char * name() const override { return LITERT_BACKEND_NAME; } + + /* available(): compiled in AND an accelerator (NPU or GPU) initializes on + * THIS host. Cheap — must not load a model. We probe by building a minimal + * EngineSettings on NPU then GPU with NO model assets; a backend whose + * delegate is missing fails settings validation. CPU alone does NOT make + * this backend "available" (CPU is the in-tree llama.cpp path's job). + * DEVICE-VERIFY: real delegate presence is only knowable on-device. */ + bool available() const override { + return probe_accelerator() != ResolvedAccelerator::kNone; + } + + /* can_serve(): a *.litertlm exists under /text/. Cheap probe, + * no caching — open() re-resolves the bundle from the context accessor. */ + bool can_serve(const char * bundle_dir) const override { + return !find_litertlm_artifact(bundle_dir).empty(); + } + + /* preference_rank(): high on Android NPU (the whole reason this backend + * exists), modest on a GPU-only fallback, 0 otherwise so llama.cpp wins. */ + int preference_rank() const override { + switch (probe_accelerator()) { + case ResolvedAccelerator::kNpu: return 100; + case ResolvedAccelerator::kGpu: return 20; + default: return 0; + } + } + + /* open(): resolve the .litertlm under the cached bundle, then walk the + * accelerator ladder NPU → GPU → CPU, recording which rung built the + * Engine. Builds a default Session and returns the streaming session. */ + LlmBackendSession * open(EliInferenceContext * ctx, + const eliza_llm_stream_config_t * cfg, + char ** out_error) override { + if (!cfg) { + litert_set_error(out_error, "[litert-lm] open: cfg is NULL"); + return nullptr; + } + const char * bundle_dir = llm_backend_context_bundle_dir(ctx); + const std::string bundle = bundle_dir ? bundle_dir : std::string(); + std::string artifact = find_litertlm_artifact(bundle.c_str()); + if (artifact.empty()) { + litert_set_error(out_error, + std::string("[litert-lm] open: no ") + LITERT_ARTIFACT_EXT + + " artifact under " + bundle + "/" + LITERT_BUNDLE_TEXT_SUBDIR); + return nullptr; + } + + /* Accelerator ladder — NPU first (Qualcomm QNN / MediaTek NeuroPilot / + * Google Tensor), then GPU (OpenCL/Metal/WebGPU), then CPU (XNNPACK). + * Each rung's failure text is preserved for the final diagnostic. + * DEVICE-VERIFY: rung availability is hardware-specific. */ + struct Rung { Backend backend; ResolvedAccelerator accel; }; + const Rung ladder[] = { + {Backend::NPU, ResolvedAccelerator::kNpu}, + {Backend::GPU, ResolvedAccelerator::kGpu}, + {Backend::CPU, ResolvedAccelerator::kCpu}, + }; + + std::unique_ptr engine; + ResolvedAccelerator resolved = ResolvedAccelerator::kNone; + std::string last_err; + for (const Rung & rung : ladder) { + engine = try_engine(artifact, rung.backend, last_err); + if (engine) { + resolved = rung.accel; + break; + } + } + if (!engine) { + litert_set_error(out_error, + std::string("[litert-lm] open: no accelerator could build the " + "engine (last error: ") + last_err + ")"); + return nullptr; + } + + auto session_cfg = SessionConfig::CreateDefault(); + auto session = engine->CreateSession(session_cfg); + if (!session.ok()) { + litert_set_error(out_error, + std::string("[litert-lm] open: CreateSession failed on ") + + accelerator_name(resolved) + ": " + + std::string(session.status().message())); + return nullptr; + } + + return new LiteRtBackendSession(std::move(engine), std::move(*session), + *cfg, resolved); + } + +private: + /* Build a no-model EngineSettings on NPU then GPU; the first whose + * delegate validates marks that rung present. Result is memoized so the + * repeated available()/preference_rank() calls are cheap. + * DEVICE-VERIFY: settings-only validation is the cheapest honest probe; + * the true delegate handshake happens at open() on-device. */ + ResolvedAccelerator probe_accelerator() const { + std::call_once(probe_once_, [this]() { + auto empty = ModelAssets::Create(std::string()); + if (!empty.ok()) { probed_ = ResolvedAccelerator::kNone; return; } + if (EngineSettings::CreateDefault(*empty, Backend::NPU).ok()) { + probed_ = ResolvedAccelerator::kNpu; + } else if (EngineSettings::CreateDefault(*empty, Backend::GPU).ok()) { + probed_ = ResolvedAccelerator::kGpu; + } else { + probed_ = ResolvedAccelerator::kNone; + } + }); + return probed_; + } + + mutable std::once_flag probe_once_; + mutable ResolvedAccelerator probed_ = ResolvedAccelerator::kNone; +}; + +} // namespace + +LlmBackendFactory * litert_backend_factory() { + static LiteRtBackendFactory factory; + return &factory; +} + +#else /* ────────────────────────── STUB (no LiteRT-LM SDK) ──────────────── */ + +/* + * Compiled-out stub: zero LiteRT-LM headers, so this builds on any host. The + * factory links in as a no-op — available() is false, can_serve() is false, + * preference_rank() is 0, and open() returns nullptr + sets `*out_error` + * "not compiled in" so the selector cleanly keeps the in-tree llama.cpp path. + */ +namespace { + +class LiteRtBackendFactoryStub final : public LlmBackendFactory { +public: + const char * name() const override { return LITERT_BACKEND_NAME; } + bool available() const override { return false; } + bool can_serve(const char * /*bundle_dir*/) const override { return false; } + int preference_rank() const override { return 0; } + + LlmBackendSession * open(EliInferenceContext * /*ctx*/, + const eliza_llm_stream_config_t * /*cfg*/, + char ** out_error) override { + litert_set_error(out_error, + "[litert-lm] backend not compiled in " + "(build with -DELIZA_ENABLE_LITERT to enable the LiteRT-LM NPU path)"); + return nullptr; + } +}; + +} // namespace + +LlmBackendFactory * litert_backend_factory() { + static LiteRtBackendFactoryStub factory; + return &factory; +} + +#endif /* ELIZA_ENABLE_LITERT */ diff --git a/tools/omnivoice/src/backends/litert-backend.h b/tools/omnivoice/src/backends/litert-backend.h new file mode 100644 index 000000000..9096b64d0 --- /dev/null +++ b/tools/omnivoice/src/backends/litert-backend.h @@ -0,0 +1,73 @@ +#pragma once +/* + * litert-backend.h — LiteRT-LM in-process streaming-LLM backend (cutover plan M4). + * + * Implements the M3 backend seam (`llm-backend.h`) on top of Google's + * LiteRT-LM C++ inference runtime, the in-process path for the Android NPU + * tier (Qualcomm QNN / MediaTek NeuroPilot / Google Tensor), with an + * optional desktop/iOS GPU fallback. LiteRT-LM is linked INTO + * `libelizainference` and exposed behind the same FFI streaming symbols — + * never a child process or TCP server (native/AGENTS.md §11, gemma4 cutover). + * + * The whole real implementation is gated behind the CMake define + * `ELIZA_ENABLE_LITERT`. When that flag is OFF this header pulls in NO + * LiteRT-LM SDK headers, so the file compiles on a host without the SDK and + * the factory links in as a no-op: `available()` is false and `open()` + * returns nullptr + sets `*out_error` "not compiled in". + * + * ── Targeted runtime API (researched 2026-06-22) ────────────────────────── + * Repo: https://github.com/google-ai-edge/LiteRT-LM (`main`) + * Docs: https://developers.google.com/edge/litert-lm/cpp + * https://ai.google.dev/edge/litert/next/litert_lm_npu + * Namespace: `litert::lm` + * + * Symbols this backend targets (verbatim from the headers above): + * - runtime/engine/engine.h + * using Engine = EngineT; + * static absl::StatusOr> + * Engine::CreateEngine(const EngineSettings&); + * absl::StatusOr> + * EngineT::CreateSession(const SessionConfig&); + * - runtime/engine/engine.h (SessionInterface) + * absl::Status RunPrefill(const std::vector&); + * absl::StatusOr RunDecode(); + * absl::StatusOr RunDecode(const DecodeConfig&); + * absl::Status GenerateContentStream( + * const std::vector&, + * absl::AnyInvocable)>); + * - runtime/engine/engine_settings.h + * static absl::StatusOr EngineSettings::CreateDefault( + * ModelAssets, Backend backend = Backend::CPU, + * std::optional vision_backend = std::nullopt, + * std::optional audio_backend = std::nullopt, + * std::optional sampler_backend = std::nullopt); + * static SessionConfig SessionConfig::CreateDefault(); + * absl::StatusOr ModelAssets::Create(); // .litertlm + * - runtime/engine/io_types.h + * using InputData = std::variant; + * class InputText { explicit InputText(std::variant); }; + * class Responses { const std::vector& GetTexts() const; }; + * - runtime/proto/engine.pb.h + * enum Backend { ... CPU, GPU, NPU, ... }; // litert::lm::Backend + * + * Accelerator ladder (Android NPU first): the factory tries NPU, then GPU, + * then CPU at `open()` and records which one initialized. Every + * hardware-gated assumption is tagged `DEVICE-VERIFY` in the .cpp — the + * accelerator ladder, the .litertlm graph fit, and tok/s can only be + * confirmed on a real NPU device, which this scaffold does not have. + */ + +#include "../llm-backend.h" + +/* Stable id matched case-insensitively against ELIZA_LLM_BACKEND, and the + * subdir + artifact extension the factory probes under /text/. */ +#define LITERT_BACKEND_NAME "litert-lm" +#define LITERT_BUNDLE_TEXT_SUBDIR "text" +#define LITERT_ARTIFACT_EXT ".litertlm" + +/* Singleton factory accessor. The selector (llm-backend-selector.cpp) calls + * this from `llm_backend_register_builtins()` to register the backend. The + * returned pointer is a static-lifetime singleton the registry does not own. + * Defined unconditionally — a build without ELIZA_ENABLE_LITERT returns a + * stub factory whose available() is false. */ +LlmBackendFactory * litert_backend_factory(); diff --git a/tools/omnivoice/src/backends/mlx-coreml-backend.h b/tools/omnivoice/src/backends/mlx-coreml-backend.h new file mode 100644 index 000000000..36d048c00 --- /dev/null +++ b/tools/omnivoice/src/backends/mlx-coreml-backend.h @@ -0,0 +1,128 @@ +#pragma once +/* + * mlx-coreml-backend.h — Apple-Silicon in-process streaming-LLM backend + * (Gemma-4 cutover plan M5). One of the alternate `LlmBackendSession` / + * `LlmBackendFactory` implementations behind the multi-runtime FFI seam + * defined in `../llm-backend.h` (cutover plan M3). + * + * Per native/AGENTS.md §11 ("one managed library, one pipe, no + * sidecar/subprocess/TCP") this backend is COMPILED INTO libelizainference + * and exposes the SAME `eliza_inference_llm_stream_*` FFI pull contract — + * it is the owned backend on Apple Silicon (mac first, iOS later), never a + * child process. Apple Foundation Models stays an opportunistic out-of- + * process adapter on the TS side and is NOT registered here. + * + * ── Two runtimes, one backend ───────────────────────────────────────────── + * + * The same `mlx-coreml` factory can serve a bundle through EITHER of two + * Apple on-device runtimes, picked at open() time from the artifact present + * under `/text/`: + * + * • MLX (PRIMARY) — Apple's array framework for Apple Silicon. We drive + * it through the C API `mlx-c` (ml-explore/mlx-c). The + * text weights are an `mlx` weights dir (safetensors, + * the mlx-lm convention) OR a `*.gguf` MLX reads via + * `mlx_load_gguf`. Decode runs the transformer graph + * on the Metal GPU stream with `mlx_quantized_matmul` + * for the quantized weight banks, + * `mlx_fast_scaled_dot_product_attention` for + * attention, and `mlx_fast_rope` for position. The KV + * cache is a pair of resident `mlx_array`s we append to + * per step (host-side cache handle, GPU-resident data). + * This is the preferred path: it gives us full control + * of the sampler, supports the Gemma SWA/shared-KV + * geometry, and matches mlx-lm's published Gemma graph. + * + * • CoreML (ALTERNATE) — Apple's MLModel runtime, which can place the graph + * on the ANE (Apple Neural Engine) as well as GPU/CPU. + * We load a compiled `*.mlmodelc` / `*.mlpackage` + * decoder and use the iOS-18 / macOS-15 **stateful** + * prediction API (`MLState`) so the KV cache lives + * inside CoreML and is updated in-place across decode + * steps (no per-token KV tensor marshalled across the + * ObjC boundary). CoreML needs Objective-C, which is + * why this whole backend is a `.mm` translation unit. + * + * TRADE-OFF (documented per the task brief): MLX is the primary path + * because it is the most flexible (custom sampler, exact Gemma geometry, + * speculative-decode-ready) and tracks mlx-lm directly; its decode runs on + * the GPU stream, not the ANE. CoreML's stateful MLModel can target the ANE + * for lower power on phones, but the decoder graph must be pre-compiled + * ahead of time, the sampler/KV layout is fixed by the converted model, and + * ANE placement of large attention graphs is fragile across OS revisions. + * We prefer MLX on mac/dev; CoreML is the alternate for ANE-bound iOS tiers + * once a stateful decoder package is published. open() selects MLX when an + * mlx weights dir / gguf is present, else falls back to the CoreML package. + * + * ── Build gate ──────────────────────────────────────────────────────────── + * + * The REAL implementation is gated behind `ELIZA_ENABLE_MLX` (the CMake + * define for this backend, per the cutover plan: LiteRT → ELIZA_ENABLE_LITERT, + * MLX/CoreML → ELIZA_ENABLE_MLX) AND `__APPLE__`. When the gate is OFF the + * translation unit includes NO Apple/MLX SDK headers, so it compiles on a + * plain Linux host: `available()` returns false, `can_serve()` returns false, + * and `open()` returns nullptr after setting `*out_error` ("not compiled in"). + * The default Linux build links it as a pure no-op and the selector skips it, + * keeping the in-tree llama.cpp path. + * + * ── API research (cited; symbols verified, not invented) ────────────────── + * + * MLX C API — ml-explore/mlx-c, `mlx/c/` headers, main @ 2026-06 (docs MLX C + * 0.4.1, https://ml-explore.github.io/mlx-c/). Symbols used by the real path: + * - device.h : `mlx_device mlx_device_new_type(mlx_device_type, int)` with + * `typedef enum { MLX_CPU, MLX_GPU } mlx_device_type;` + * - stream.h : `mlx_stream mlx_default_gpu_stream_new(void)`, + * `mlx_stream mlx_default_cpu_stream_new(void)` + * - io.h : `int mlx_load_safetensors(mlx_map_string_to_array*, + * mlx_map_string_to_string*, const char* file, mlx_stream)`, + * `int mlx_load_gguf(mlx_io_gguf*, const char* file, mlx_stream)` + * - array.h : `mlx_array mlx_array_new_data(const void*, const int* shape, + * int dim, mlx_dtype)`, `int mlx_array_eval(mlx_array)`, + * `int mlx_array_item_int32(int32_t*, mlx_array)`, + * `const float* mlx_array_data_float32(mlx_array)`, + * `int mlx_array_free(mlx_array)` + * - ops.h : `int mlx_quantized_matmul(mlx_array*, x, w, scales, biases, + * bool transpose, mlx_optional_int group_size, + * mlx_optional_int bits, const char* mode, mlx_stream)`, + * `int mlx_matmul(...)`, `int mlx_softmax_axes(...)`, + * `int mlx_argmax_axis(mlx_array*, a, int axis, bool, stream)`, + * `int mlx_take(mlx_array*, a, indices, stream)`, + * `int mlx_astype(...)`, `int mlx_concatenate(...)` + * - fast.h : `int mlx_fast_scaled_dot_product_attention(mlx_array*, q, k, + * v, float scale, const char* mask_mode, mlx_array mask, + * mlx_array sinks, mlx_stream)`, + * `int mlx_fast_rope(mlx_array*, x, int dims, bool traditional, + * mlx_optional_float base, float scale, int offset, + * mlx_array freqs, mlx_stream)` + * Gemma on MLX: ml-explore/mlx-lm (`mlx_lm/models/gemma*.py`) — the reference + * for the dense SWA + shared-KV + dual-head-dim graph this backend mirrors. + * + * CoreML stateful KV-cache — Apple Core ML, MLState API, macOS 15 / iOS 18 + * (WWDC24 "Bring your ML and AI models to Apple silicon"; coremltools + * Stateful Models guide, https://apple.github.io/coremltools/docs-guides/ + * source/stateful-models.html). ObjC symbols used: + * - `+ (nullable instancetype)modelWithContentsOfURL:(NSURL*)url + * error:(NSError**)error;` (and the compiled-model `compileModelAtURL:`) + * - `- (MLState*)newState;` (creates zeroed KV state buffers; MLState is + * +new/-init UNAVAILABLE — only MLModel vends it) + * - `- (nullable id)predictionFromFeatures: + * (id)input usingState:(MLState*)state + * error:(NSError**)error;` (the in-place stateful decode step) + * Apple's own "On-Device Llama 3.1 with Core ML" research post documents the + * prefill-then-stateful-decode loop this backend's MLX/CoreML paths follow. + * + * Every hardware-specific assumption that can only be confirmed on Apple + * Silicon is marked `DEVICE-VERIFY` in the .mm. This header carries no SDK + * dependency and is safe to include anywhere. + */ + +#include "../llm-backend.h" + +/* Free-function accessor returning the singleton `mlx-coreml` factory so the + * selector (llm-backend-selector.cpp, wired separately) can register it via + * `llm_backend_register(mlx_coreml_backend_factory())`. Defined in + * mlx-coreml-backend.mm. Always returns a valid non-null static-lifetime + * pointer — when the build gate is OFF the returned factory reports + * available()/can_serve() == false and open() == nullptr ("not compiled in"), + * so registering it unconditionally is safe. */ +LlmBackendFactory * mlx_coreml_backend_factory(); diff --git a/tools/omnivoice/src/backends/mlx-coreml-backend.mm b/tools/omnivoice/src/backends/mlx-coreml-backend.mm new file mode 100644 index 000000000..4b705d719 --- /dev/null +++ b/tools/omnivoice/src/backends/mlx-coreml-backend.mm @@ -0,0 +1,797 @@ +/* + * mlx-coreml-backend.mm — Apple-Silicon streaming-LLM backend (cutover M5). + * + * Objective-C++ translation unit: CoreML's MLModel / MLState API is + * Objective-C, and the MLX C++ / mlx-c headers also compile cleanly in a + * `.mm`. See mlx-coreml-backend.h for the full API research + citations and + * the MLX-primary / CoreML-alternate trade-off. + * + * STRUCTURE + * The whole real implementation sits behind + * #if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__) + * and is the ONLY place that includes any MLX / CoreML SDK header. With the + * gate OFF (the default Linux build) this file pulls in no SDK header at all + * and compiles to a pure no-op factory: available()/can_serve() == false, + * open() returns nullptr after setting *out_error to "not compiled in". + * + * ERROR CONTRACT (native/AGENTS.md §3 + §9): never log, never return a + * defaulted result on failure. Out-error strings are heap-allocated with + * malloc (mirroring eliza-inference-ffi.cpp's `eliza_strdup`) so the FFI + * caller frees them with `eliza_inference_free_string` / free(). + */ + +#include "mlx-coreml-backend.h" + +#include +#include +#include +#include + +// =========================================================================== +// Shared (gate-independent) helpers +// =========================================================================== + +namespace { + +/* Heap-allocate an out-error string the way the FFI translation unit does + * (eliza-inference-ffi.cpp::eliza_strdup) so the caller's free() path is + * identical regardless of which backend produced the error. */ +void mlx_set_error(char ** out_error, const std::string & msg) { + if (!out_error) { + return; + } + char * out = static_cast(std::malloc(msg.size() + 1)); + if (!out) { + *out_error = nullptr; + return; + } + std::memcpy(out, msg.c_str(), msg.size() + 1); + *out_error = out; +} + +} // namespace + +// =========================================================================== +// REAL IMPLEMENTATION — Apple Silicon only, gated on ELIZA_ENABLE_MLX +// =========================================================================== +#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__) + +// --- Objective-C / Apple frameworks --------------------------------------- +#import +#import // MLModel, MLState, MLFeatureProvider, MLMultiArray +#import // MTLCreateSystemDefaultDevice — Metal/ANE presence probe + +// --- MLX C API (ml-explore/mlx-c) ------------------------------------------ +// Only included behind the gate so a host without the MLX SDK still compiles. +#include "mlx/c/array.h" +#include "mlx/c/device.h" +#include "mlx/c/stream.h" +#include "mlx/c/io.h" +#include "mlx/c/ops.h" +#include "mlx/c/fast.h" +#include "mlx/c/map.h" + +#include +#include +#include +#include +#include + +namespace { + +namespace fs = std::filesystem; + +// --- bundle artifact discovery -------------------------------------------- + +enum class AppleRuntime { + None, + Mlx, // mlx weights dir (safetensors) or *.gguf under text/ + CoreMl, // *.mlmodelc / *.mlpackage under text/ +}; + +bool has_suffix(const std::string & s, const char * suffix) { + const size_t n = std::strlen(suffix); + return s.size() >= n && std::equal(s.end() - n, s.end(), suffix); +} + +/* Probe `/text/` for an Apple-servable artifact and report which + * runtime would serve it. MLX is preferred when both kinds are present (an + * mlx weights dir / gguf wins over a CoreML package), matching the header's + * "MLX primary, CoreML alternate" rule. Cheap directory walk, no model load. */ +AppleRuntime detect_runtime(const char * bundle_dir, std::string & out_artifact) { + out_artifact.clear(); + if (!bundle_dir || bundle_dir[0] == '\0') { + return AppleRuntime::None; + } + std::error_code ec; + fs::path text_dir = fs::path(bundle_dir) / "text"; + if (!fs::is_directory(text_dir, ec)) { + return AppleRuntime::None; + } + + std::string gguf, mlpackage, mlmodelc, mlx_weights_dir; + for (fs::directory_iterator it(text_dir, ec), end; it != end && !ec; it.increment(ec)) { + const fs::path & p = it->path(); + const std::string name = p.filename().string(); + if (it->is_directory(ec)) { + // mlx-lm exports an `mlx` weights dir (model.safetensors + config.json), + // or a *.mlmodelc compiled CoreML model is itself a directory. + if (has_suffix(name, ".mlmodelc")) { + if (mlmodelc.empty()) mlmodelc = p.string(); + } else if (name == "mlx" || fs::exists(p / "model.safetensors", ec) || + fs::exists(p / "weights.safetensors", ec)) { + if (mlx_weights_dir.empty()) mlx_weights_dir = p.string(); + } + } else { + if (has_suffix(name, ".gguf")) { + if (gguf.empty()) gguf = p.string(); + } else if (has_suffix(name, ".mlpackage")) { + if (mlpackage.empty()) mlpackage = p.string(); + } else if (has_suffix(name, ".safetensors")) { + if (mlx_weights_dir.empty()) mlx_weights_dir = text_dir.string(); + } + } + } + + // MLX primary: weights dir / safetensors first, then gguf. + if (!mlx_weights_dir.empty()) { out_artifact = mlx_weights_dir; return AppleRuntime::Mlx; } + if (!gguf.empty()) { out_artifact = gguf; return AppleRuntime::Mlx; } + // CoreML alternate: compiled model, then package. + if (!mlmodelc.empty()) { out_artifact = mlmodelc; return AppleRuntime::CoreMl; } + if (!mlpackage.empty()) { out_artifact = mlpackage; return AppleRuntime::CoreMl; } + return AppleRuntime::None; +} + +/* True when a Metal device (hence GPU + ANE on Apple Silicon) is present. + * DEVICE-VERIFY: on a real Apple-Silicon Mac/phone this returns a valid + * MTLDevice; on a Mac without Metal (or an unexpected host) it is nil and the + * backend reports unavailable rather than crashing at open(). */ +bool metal_device_present() { + @autoreleasepool { + id dev = MTLCreateSystemDefaultDevice(); + return dev != nil; + } +} + +// =========================================================================== +// MLX-backed session (PRIMARY) +// =========================================================================== +// +// DEVICE-VERIFY: the decode graph below is structurally complete and uses the +// real mlx-c symbols, but the exact per-layer wiring of the Gemma graph +// (alternating local-SWA / global attention, dual head dims, shared-KV layer +// reuse, Per-Layer-Embeddings) must be assembled + numerically validated on +// Apple Silicon against mlx-lm's `gemma*` reference. The weight-tensor names, +// quant group_size/bits, and rope base/scale are read from the model config at +// load; they are not hardcoded here. + +class MlxLlmSession final : public LlmBackendSession { +public: + MlxLlmSession(std::string artifact, const eliza_llm_stream_config_t * cfg) + : artifact_(std::move(artifact)) { + if (cfg) { + cfg_ = *cfg; + have_cfg_ = true; + } + } + + ~MlxLlmSession() override { + free_kv(); + // mlx_array handles are value types wrapping a refcounted ctx; freeing + // releases our reference. The Metal stream/device are process-global. + } + + /* Load weights + build the resident graph. Returns ELIZA_OK or negative. + * + * The two on-disk shapes are loaded with the two distinct mlx-c readers: + * - safetensors (mlx-lm convention): mlx_load_safetensors fills a + * mlx_map_string_to_array keyed by tensor name (looked up per-tensor + * via mlx_map_string_to_array_get when the graph is assembled); + * - gguf: mlx_load_gguf fills a mlx_io_gguf whose tensors are read by + * key via mlx_io_gguf_get_array (key list from mlx_io_gguf_get_keys). + * We keep whichever handle we loaded resident; the per-tensor pulls happen + * inside run_forward when the Gemma graph is assembled on Metal. */ + int init(char ** out_error) { + // GPU stream (Metal). DEVICE-VERIFY: requires a Metal device. + gpu_stream_ = mlx_default_gpu_stream_new(); + + int rc; + if (has_suffix(artifact_, ".gguf")) { + gguf_ = mlx_io_gguf_new(); + rc = mlx_load_gguf(&gguf_, artifact_.c_str(), gpu_stream_); + if (rc == 0) { + have_gguf_ = true; + } + } else { + // mlx weights dir / safetensors (the mlx-lm convention). + std::string file = artifact_; + std::error_code ec; + if (fs::is_directory(file, ec)) { + if (fs::exists(fs::path(file) / "model.safetensors", ec)) { + file = (fs::path(file) / "model.safetensors").string(); + } else if (fs::exists(fs::path(file) / "weights.safetensors", ec)) { + file = (fs::path(file) / "weights.safetensors").string(); + } + } + weights_ = mlx_map_string_to_array_new(); + weights_meta_ = mlx_map_string_to_string_new(); + rc = mlx_load_safetensors(&weights_, &weights_meta_, file.c_str(), gpu_stream_); + if (rc == 0) { + have_weights_ = true; + } + } + if (rc != 0) { + free_weights(); + mlx_set_error(out_error, + "[mlx-coreml] MLX failed to load weights from " + artifact_); + return ELIZA_ERR_BUNDLE_INVALID; + } + + // DEVICE-VERIFY: parse the sibling config.json (vocab, n_layer, head + // dims global/swa, sliding-window, rope base, shared-KV layer map, PLE + // table, quant bits/group_size) into graph_ here. Mirrors + // mlx_lm.utils.load's config handling. Left as the on-Metal assembly + // step — the streaming contract below does not depend on its details. + return ELIZA_OK; + } + + int prefill(const int32_t * token_ids, size_t num_tokens, + char ** out_error) override { + if (!have_weights_) { + mlx_set_error(out_error, "[mlx-coreml] prefill before init"); + return ELIZA_ERR_INVALID_ARG; + } + if (!token_ids || num_tokens == 0) { + mlx_set_error(out_error, "[mlx-coreml] prefill: empty prompt"); + return ELIZA_ERR_INVALID_ARG; + } + cancel_.store(false); + + // Copy the prompt (the contract says prefill copies the tokens it needs). + prompt_.assign(token_ids, token_ids + num_tokens); + n_past_ = 0; + generated_ = 0; + + // Build the [1, T] int32 input and run one forward pass that fills KV. + // DEVICE-VERIFY: run_forward() must execute the Gemma decoder over the + // whole prompt at positions [0, T) and append to the resident KV + // arrays. The final-position logits feed the first sampled token. + const int shape[2] = {1, static_cast(num_tokens)}; + mlx_array input = mlx_array_new_data(prompt_.data(), shape, 2, MLX_INT32); + int rc = run_forward(input, /*start_pos=*/0, &last_logits_, out_error); + mlx_array_free(input); + if (rc != ELIZA_OK) { + return rc; + } + n_past_ = static_cast(num_tokens); + return ELIZA_OK; + } + + int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out, + char * text_out, size_t text_cap, int32_t * drafter_drafted_out, + int32_t * drafter_accepted_out, char ** out_error) override { + if (num_tokens_out) *num_tokens_out = 0; + if (text_out && text_cap) text_out[0] = '\0'; + // No speculative drafter on the MLX path yet (M6 wires MTP). + if (drafter_drafted_out) *drafter_drafted_out = 0; + if (drafter_accepted_out) *drafter_accepted_out = 0; + + if (!have_weights_) { + mlx_set_error(out_error, "[mlx-coreml] next before init/prefill"); + return ELIZA_ERR_INVALID_ARG; + } + if (cancel_.load()) { + return ELIZA_ERR_CANCELLED; + } + if (!tokens_out || tokens_cap == 0) { + mlx_set_error(out_error, "[mlx-coreml] next: token buffer too small"); + return ELIZA_ERR_INVALID_ARG; + } + + // Sample one token from last_logits_ (greedy here; temperature / top-p / + // top-k from cfg_ applied in sample_token). + // DEVICE-VERIFY: sample_token reads last_logits_ (an mlx_array of shape + // [1, vocab]) and returns one int32 token id. + int32_t next_id = 0; + int rc = sample_token(last_logits_, &next_id, out_error); + if (rc != ELIZA_OK) { + return rc; + } + + tokens_out[0] = next_id; + if (num_tokens_out) *num_tokens_out = 1; + generated_++; + + // Detokenize the single committed token into text_out (UTF-8). + // DEVICE-VERIFY: detokenize_piece resolves next_id against the model's + // vocab (loaded from the tokenizer sidecar / gguf vocab) and writes the + // UTF-8 piece. Partial multi-byte pieces are buffered across calls. + detokenize_piece(next_id, text_out, text_cap); + + const bool hit_eos = is_eos(next_id); + const int32_t cap = (have_cfg_ && cfg_.max_tokens > 0) + ? cfg_.max_tokens + : default_max_tokens_; + const bool hit_cap = generated_ >= cap; + if (hit_eos || hit_cap) { + return 1; // final step + } + + // Advance one position: forward pass for the just-sampled token only. + const int shape[2] = {1, 1}; + mlx_array step_in = mlx_array_new_data(&next_id, shape, 2, MLX_INT32); + rc = run_forward(step_in, /*start_pos=*/n_past_, &last_logits_, out_error); + mlx_array_free(step_in); + if (rc != ELIZA_OK) { + return rc; + } + n_past_++; + return cancel_.load() ? ELIZA_ERR_CANCELLED : 0; // more + } + + int cancel() override { + cancel_.store(true); + return ELIZA_OK; + } + + int reset() override { + cancel_.store(false); + prompt_.clear(); + n_past_ = 0; + generated_ = 0; + free_kv(); // drop resident KV arrays + free_logits(); + return ELIZA_OK; + } + + int reset_keep(int32_t n_keep) override { + // MLX KV is a resident pair of arrays we append to; trimming to a prefix + // is a tensor slice. DEVICE-VERIFY: when the on-Metal KV slice is wired, + // keep [0, n_keep) of the K/V arrays and set n_past_ = clamp(n_keep). + // Until that lands, do the contract-mandated SAFE fallback: full reset, + // return 0 — never an error (llm-backend.h reset_keep contract). + (void) n_keep; + reset(); + return 0; + } + +private: + void free_kv() { + if (have_kv_) { + mlx_array_free(kv_k_); + mlx_array_free(kv_v_); + have_kv_ = false; + } + } + void free_logits() { + if (have_logits_) { + mlx_array_free(last_logits_); + have_logits_ = false; + } + } + + /* One transformer forward pass over `input` ([1, T] int32) starting at + * position `start_pos`, appending to the resident KV cache and writing the + * final-position logits ([1, vocab]) into *out_logits. + * + * DEVICE-VERIFY: this is the Gemma decoder graph. It must, per layer: + * - embed tokens (+ Per-Layer-Embeddings) ; + * - apply mlx_fast_rope with the layer's (global vs SWA) head dim ; + * - run mlx_fast_scaled_dot_product_attention with mask_mode "causal" for + * global layers and a windowed mask for SWA layers ; + * - reuse earlier-layer KV on shared-KV layers ; + * - mlx_quantized_matmul for quantized weight banks (group_size/bits from + * config), mlx_matmul for f16 banks ; + * - mlx_array_eval the result on gpu_stream_ to force materialization. + * The scaffolding owns the resident-KV bookkeeping; the per-op assembly is + * the on-Metal step validated against mlx-lm. */ + int run_forward(mlx_array /*input*/, int /*start_pos*/, mlx_array * out_logits, + char ** out_error) { + // Until the on-Metal graph is assembled, surface a precise, non-default + // failure (§3: never return a defaulted result). When the graph lands, + // this returns ELIZA_OK with *out_logits set and the KV appended. + free_logits(); + (void) out_logits; + mlx_set_error(out_error, + "[mlx-coreml] MLX Gemma decode graph not assembled on this build " + "(DEVICE-VERIFY: requires Apple Silicon)"); + return ELIZA_ERR_NOT_IMPLEMENTED; + } + + int sample_token(mlx_array logits, int32_t * out_id, char ** out_error) { + if (!have_logits_) { + mlx_set_error(out_error, "[mlx-coreml] no logits to sample"); + return ELIZA_ERR_INVALID_ARG; + } + // DEVICE-VERIFY: apply cfg_.temperature / top_p / top_k / repeat_penalty + // then categorical sample; greedy argmax shown as the structural default. + mlx_array arg = mlx_array_new(); + if (mlx_argmax_axis(&arg, logits, /*axis=*/-1, /*keepdims=*/false, gpu_stream_) != 0) { + mlx_array_free(arg); + mlx_set_error(out_error, "[mlx-coreml] argmax failed"); + return ELIZA_ERR_FFI_FAULT; + } + mlx_array_eval(arg); + int32_t id = 0; + const int rc = mlx_array_item_int32(&id, arg); + mlx_array_free(arg); + if (rc != 0) { + mlx_set_error(out_error, "[mlx-coreml] failed to read sampled token"); + return ELIZA_ERR_FFI_FAULT; + } + *out_id = id; + return ELIZA_OK; + } + + bool is_eos(int32_t id) const { + // DEVICE-VERIFY: compare against the model's EOS / ids + // (Gemma uses ) read from the tokenizer config at load. + return id == eos_id_; + } + + void detokenize_piece(int32_t /*id*/, char * text_out, size_t text_cap) { + // DEVICE-VERIFY: resolve the token piece from the loaded vocab and copy + // its UTF-8 bytes (buffering partial code points across calls). The + // empty string here keeps the contract intact (committed id is already + // in tokens_out) until the vocab path is wired. + if (text_out && text_cap) { + text_out[0] = '\0'; + } + } + + std::string artifact_; + eliza_llm_stream_config_t cfg_{}; + bool have_cfg_ = false; + + mlx_stream gpu_stream_{}; + mlx_map_string_to_array weights_{}; + mlx_map_string_to_string weights_meta_{}; + bool have_weights_ = false; + + mlx_array kv_k_{}; + mlx_array kv_v_{}; + bool have_kv_ = false; + + mlx_array last_logits_{}; + bool have_logits_ = false; + + std::vector prompt_; + int n_past_ = 0; + int generated_ = 0; + int32_t eos_id_ = -1; + int32_t default_max_tokens_ = 2048; + + std::atomic cancel_{false}; +}; + +// =========================================================================== +// CoreML-backed session (ALTERNATE — ANE-bound, stateful MLState KV cache) +// =========================================================================== +// +// DEVICE-VERIFY: the converted decoder package must expose (a) an input +// feature for the current token id(s) and position, (b) an MLState-backed KV +// cache, and (c) a logits output. Apple's "On-Device Llama 3.1 with Core ML" +// post is the reference for the prefill-then-stateful-decode loop. We hold the +// MLModel + its MLState and call predictionFromFeatures:usingState:error: per +// step so the KV updates in-place inside CoreML (no per-token KV marshalling). + +class CoreMlLlmSession final : public LlmBackendSession { +public: + CoreMlLlmSession(std::string package, const eliza_llm_stream_config_t * cfg) + : package_(std::move(package)) { + if (cfg) { + cfg_ = *cfg; + have_cfg_ = true; + } + } + + ~CoreMlLlmSession() override { + @autoreleasepool { + state_ = nil; + model_ = nil; + } + } + + int init(char ** out_error) { + @autoreleasepool { + NSError * err = nil; + NSURL * url = [NSURL fileURLWithPath: + [NSString stringWithUTF8String:package_.c_str()]]; + + NSURL * compiled = url; + // A *.mlpackage must be compiled to *.mlmodelc before loading; a + // *.mlmodelc loads directly. DEVICE-VERIFY: compileModelAtURL is a + // synchronous one-time compile; production caches the result. + if ([package_.c_str() ? @(package_.c_str()) : @"" hasSuffix:@".mlpackage"]) { + NSURL * c = [MLModel compileModelAtURL:url error:&err]; + if (!c) { + mlx_set_error(out_error, std::string( + "[mlx-coreml] CoreML compile failed: ") + + (err ? err.localizedDescription.UTF8String : "unknown")); + return ELIZA_ERR_BUNDLE_INVALID; + } + compiled = c; + } + + MLModelConfiguration * conf = [[MLModelConfiguration alloc] init]; + // DEVICE-VERIFY: .all lets CoreML place the decoder on ANE when the + // converted graph is ANE-eligible, else GPU/CPU. + conf.computeUnits = MLComputeUnitsAll; + + model_ = [MLModel modelWithContentsOfURL:compiled + configuration:conf + error:&err]; + if (!model_) { + mlx_set_error(out_error, std::string( + "[mlx-coreml] CoreML model load failed: ") + + (err ? err.localizedDescription.UTF8String : "unknown")); + return ELIZA_ERR_BUNDLE_INVALID; + } + + // newState vends zeroed KV buffers; MLState is +new/-init + // UNAVAILABLE — only MLModel produces it (macOS 15 / iOS 18). + state_ = [model_ newState]; + if (!state_) { + mlx_set_error(out_error, + "[mlx-coreml] CoreML model has no stateful KV cache " + "(newState returned nil) — needs a stateful decoder package"); + return ELIZA_ERR_BUNDLE_INVALID; + } + return ELIZA_OK; + } + } + + int prefill(const int32_t * token_ids, size_t num_tokens, + char ** out_error) override { + if (!model_ || !state_) { + mlx_set_error(out_error, "[mlx-coreml] prefill before init"); + return ELIZA_ERR_INVALID_ARG; + } + if (!token_ids || num_tokens == 0) { + mlx_set_error(out_error, "[mlx-coreml] prefill: empty prompt"); + return ELIZA_ERR_INVALID_ARG; + } + cancel_.store(false); + prompt_.assign(token_ids, token_ids + num_tokens); + n_past_ = 0; + generated_ = 0; + + // DEVICE-VERIFY: feed the whole prompt as one prediction with positions + // [0, T) so CoreML fills the MLState KV in one pass, then keep the + // final-position logits for the first sampled token. The feature names + // ("input_ids", "position", "logits") are dictated by the converted + // model's MLModelDescription — read them from model_.modelDescription. + return run_step(prompt_.data(), prompt_.size(), /*start_pos=*/0, out_error); + } + + int next(int32_t * tokens_out, size_t tokens_cap, size_t * num_tokens_out, + char * text_out, size_t text_cap, int32_t * drafter_drafted_out, + int32_t * drafter_accepted_out, char ** out_error) override { + if (num_tokens_out) *num_tokens_out = 0; + if (text_out && text_cap) text_out[0] = '\0'; + if (drafter_drafted_out) *drafter_drafted_out = 0; + if (drafter_accepted_out) *drafter_accepted_out = 0; + + if (!model_ || !state_) { + mlx_set_error(out_error, "[mlx-coreml] next before init/prefill"); + return ELIZA_ERR_INVALID_ARG; + } + if (cancel_.load()) { + return ELIZA_ERR_CANCELLED; + } + if (!tokens_out || tokens_cap == 0) { + mlx_set_error(out_error, "[mlx-coreml] next: token buffer too small"); + return ELIZA_ERR_INVALID_ARG; + } + + int32_t next_id = 0; + int rc = sample_from_last_logits(&next_id, out_error); + if (rc != ELIZA_OK) { + return rc; + } + tokens_out[0] = next_id; + if (num_tokens_out) *num_tokens_out = 1; + generated_++; + detokenize_piece(next_id, text_out, text_cap); + + const int32_t cap = (have_cfg_ && cfg_.max_tokens > 0) + ? cfg_.max_tokens + : default_max_tokens_; + if (is_eos(next_id) || generated_ >= cap) { + return 1; // final + } + + // One stateful decode step for the just-sampled token. + const int32_t one = next_id; + rc = run_step(&one, 1, /*start_pos=*/n_past_, out_error); + if (rc != ELIZA_OK) { + return rc; + } + n_past_++; + return cancel_.load() ? ELIZA_ERR_CANCELLED : 0; // more + } + + int cancel() override { + cancel_.store(true); + return ELIZA_OK; + } + + int reset() override { + cancel_.store(false); + prompt_.clear(); + n_past_ = 0; + generated_ = 0; + @autoreleasepool { + // A fresh MLState zeroes the KV cache — the canonical CoreML reset. + if (model_) { + state_ = [model_ newState]; + } + } + return ELIZA_OK; + } + + int reset_keep(int32_t n_keep) override { + // CoreML's MLState is opaque: there is no public API to truncate the KV + // to a prefix. Per the llm-backend.h contract, fall back to a full + // reset and return 0 — never an error. + (void) n_keep; + reset(); + return 0; + } + +private: + /* Run one prediction (`n` tokens starting at `start_pos`) through the + * stateful model, updating the MLState KV in place and caching the + * final-position logits. DEVICE-VERIFY: builds an MLFeatureProvider from + * the converted model's actual input descriptions and reads the logits + * MLMultiArray from the output provider. */ + int run_step(const int32_t * /*tokens*/, size_t /*n*/, int /*start_pos*/, + char ** out_error) { + // The feature-name binding is model-specific and only knowable from a + // real converted package, so surface a precise failure (§3) rather than + // a defaulted success. When the package is wired this calls + // predictionFromFeatures:usingState:error: and stores the logits. + mlx_set_error(out_error, + "[mlx-coreml] CoreML stateful decode not bound to a converted " + "decoder package on this build (DEVICE-VERIFY: requires a stateful " + "*.mlmodelc and Apple Silicon)"); + return ELIZA_ERR_NOT_IMPLEMENTED; + } + + int sample_from_last_logits(int32_t * /*out_id*/, char ** out_error) { + // DEVICE-VERIFY: argmax / temperature-sample over the cached logits + // MLMultiArray. Fails precisely until run_step populates them. + mlx_set_error(out_error, "[mlx-coreml] no CoreML logits to sample"); + return ELIZA_ERR_NOT_IMPLEMENTED; + } + + bool is_eos(int32_t id) const { return id == eos_id_; } + + void detokenize_piece(int32_t /*id*/, char * text_out, size_t text_cap) { + if (text_out && text_cap) { + text_out[0] = '\0'; + } + } + + std::string package_; + eliza_llm_stream_config_t cfg_{}; + bool have_cfg_ = false; + + MLModel * model_ = nil; + MLState * state_ = nil; + + std::vector prompt_; + int n_past_ = 0; + int generated_ = 0; + int32_t eos_id_ = -1; + int32_t default_max_tokens_ = 2048; + + std::atomic cancel_{false}; +}; + +// =========================================================================== +// Factory (real) +// =========================================================================== + +class MlxCoreMlFactory final : public LlmBackendFactory { +public: + const char * name() const override { return "mlx-coreml"; } + + bool available() const override { + // Compiled in (we are inside the gate) AND a Metal device is present. + // DEVICE-VERIFY: true on Apple Silicon; false on a Mac without Metal. + return metal_device_present(); + } + + bool can_serve(const char * bundle_dir) const override { + std::string artifact; + return detect_runtime(bundle_dir, artifact) != AppleRuntime::None; + } + + int preference_rank() const override { + // Highest on Apple Silicon: the in-process Metal/ANE path beats the + // in-tree llama.cpp Metal path for the Gemma geometry. > LiteRT(0 here). + return 100; + } + + LlmBackendSession * open(EliInferenceContext * ctx, + const eliza_llm_stream_config_t * cfg, + char ** out_error) override { + // Resolve the bundle root from the context accessor (the struct is + // otherwise opaque here), then pick MLX vs CoreML from its artifacts. + const char * bundle_dir = llm_backend_context_bundle_dir(ctx); + const std::string bundle = bundle_dir ? bundle_dir : std::string(); + if (bundle.empty()) { + mlx_set_error(out_error, + "[mlx-coreml] open: context has no bundle dir"); + return nullptr; + } + std::string artifact; + const AppleRuntime rt = detect_runtime(bundle.c_str(), artifact); + if (rt == AppleRuntime::Mlx) { + auto * s = new MlxLlmSession(artifact, cfg); + const int rc = s->init(out_error); + if (rc != ELIZA_OK) { + delete s; + return nullptr; + } + return s; + } + if (rt == AppleRuntime::CoreMl) { + auto * s = new CoreMlLlmSession(artifact, cfg); + const int rc = s->init(out_error); + if (rc != ELIZA_OK) { + delete s; + return nullptr; + } + return s; + } + mlx_set_error(out_error, + "[mlx-coreml] open: bundle has no MLX/CoreML text artifact under text/"); + return nullptr; + } +}; + +MlxCoreMlFactory g_factory; + +} // namespace + +LlmBackendFactory * mlx_coreml_backend_factory() { + return &g_factory; +} + +// =========================================================================== +// STUB IMPLEMENTATION — every non-Apple / gate-OFF build +// =========================================================================== +#else // !(ELIZA_ENABLE_MLX && __APPLE__) + +namespace { + +/* No SDK header is included on this path, so the file compiles on a plain + * Linux host. The factory reports itself unavailable and refuses to open. */ +class MlxCoreMlStubFactory final : public LlmBackendFactory { +public: + const char * name() const override { return "mlx-coreml"; } + bool available() const override { return false; } + bool can_serve(const char * /*bundle_dir*/) const override { return false; } + int preference_rank() const override { return 0; } + + LlmBackendSession * open(EliInferenceContext * /*ctx*/, + const eliza_llm_stream_config_t * /*cfg*/, + char ** out_error) override { + mlx_set_error(out_error, + "[mlx-coreml] backend not compiled in " + "(needs -DELIZA_ENABLE_MLX on Apple Silicon)"); + return nullptr; + } +}; + +MlxCoreMlStubFactory g_stub_factory; + +} // namespace + +LlmBackendFactory * mlx_coreml_backend_factory() { + return &g_stub_factory; +} + +#endif // ELIZA_ENABLE_MLX && __APPLE__ diff --git a/tools/omnivoice/src/eliza-inference-ffi.cpp b/tools/omnivoice/src/eliza-inference-ffi.cpp index 345c87cb0..94127affc 100644 --- a/tools/omnivoice/src/eliza-inference-ffi.cpp +++ b/tools/omnivoice/src/eliza-inference-ffi.cpp @@ -14,6 +14,7 @@ // resolve `eliza_inference_*` symbols from this object. #include "eliza-inference-ffi.h" +#include "llm-backend.h" #include "omnivoice.h" #include "llama.h" #include "mtmd.h" @@ -173,6 +174,13 @@ struct EliInferenceContext { #endif }; +/* M3 seam accessor (declared in llm-backend.h): hand a backend's open() the + * bundle root without exposing the struct. Defined here where the type is + * complete. */ +const char * llm_backend_context_bundle_dir(const EliInferenceContext * ctx) { + return ctx ? ctx->bundle_dir.c_str() : nullptr; +} + /* ELZ2 magic 'ELZ1' (the ascii bytes 'E','L','Z','1' little-endian). * The magic stays 'ELZ1' across format versions — only the version * word at offset 4 changes between v1 and v2. */ @@ -1135,6 +1143,11 @@ static void reset_engine(Engine * e) { struct EliLlmStream { EliInferenceContext * ctx = nullptr; + /* Multi-backend seam (M3): when non-NULL, this session is driven by an + * alternate in-process runtime (LiteRT-LM / MLX-CoreML) and the llama.cpp + * fields below (lctx/sampler/mtp) are unused — every FFI streaming entry + * delegates to `backend` and returns before touching the llama.cpp path. */ + LlmBackendSession * backend = nullptr; llama_context * lctx = nullptr; llama_sampler * sampler = nullptr; int n_past = 0; @@ -2887,6 +2900,40 @@ EliLlmStream * eliza_inference_llm_stream_open( return nullptr; } + /* Multi-backend seam (M3): an alternate in-process runtime (LiteRT-LM / + * MLX-CoreML) may serve this bundle. The selector returns nullptr with NO + * error to keep the in-tree llama.cpp path below; nullptr WITH an error is a + * hard env-select failure to propagate. */ + { + char * sel_err = nullptr; + LlmBackendFactory * factory = + llm_backend_select(ctx->bundle_dir.c_str(), cfg, &sel_err); + if (!factory && sel_err) { + if (out_error) { + *out_error = sel_err; + } else { + eliza_inference_free_string(sel_err); + } + return nullptr; + } + if (factory) { + EliLlmStream * bstream = new (std::nothrow) EliLlmStream(); + if (!bstream) { + eliza_set_error(out_error, + "[libelizainference] llm_stream_open: out of memory"); + return nullptr; + } + bstream->ctx = ctx; + bstream->max_tokens = cfg->max_tokens > 0 ? cfg->max_tokens : 0; + bstream->backend = factory->open(ctx, cfg, out_error); + if (!bstream->backend) { + delete bstream; + return nullptr; + } + return bstream; + } + } + llama_model * model = nullptr; { std::lock_guard lock(ctx->llm_mutex); @@ -2988,6 +3035,9 @@ int eliza_inference_llm_stream_prefill( const int32_t * token_ids, size_t num_tokens, char ** out_error) { + if (stream && stream->backend) { + return stream->backend->prefill(token_ids, num_tokens, out_error); + } if (!stream || (!stream->lctx && !stream->mtp)) { eliza_set_error(out_error, "[libelizainference] llm_stream_prefill: invalid session"); @@ -3056,6 +3106,11 @@ int eliza_inference_llm_stream_next( if (drafter_accepted_out) *drafter_accepted_out = 0; if (text_out && text_cap > 0) text_out[0] = '\0'; + if (stream && stream->backend) { + return stream->backend->next(tokens_out, tokens_cap, num_tokens_out, + text_out, text_cap, drafter_drafted_out, + drafter_accepted_out, out_error); + } if (!stream || (!stream->mtp && (!stream->lctx || !stream->sampler))) { eliza_set_error(out_error, "[libelizainference] llm_stream_next: invalid session"); @@ -3245,6 +3300,9 @@ int eliza_inference_llm_stream_next( } int eliza_inference_llm_stream_cancel(EliLlmStream * stream) { + if (stream && stream->backend) { + return stream->backend->cancel(); + } if (stream) { stream->cancel.store(true, std::memory_order_release); } @@ -3255,6 +3313,9 @@ int eliza_inference_llm_stream_save_slot( EliLlmStream * stream, const char * filename, char ** out_error) { + if (stream && stream->backend) { + return stream->backend->save_slot(filename, out_error); + } (void) stream; (void) filename; /* v1: cross-launch slot KV persistence is not wired. Return a structured @@ -3269,6 +3330,9 @@ int eliza_inference_llm_stream_restore_slot( EliLlmStream * stream, const char * filename, char ** out_error) { + if (stream && stream->backend) { + return stream->backend->restore_slot(filename, out_error); + } (void) stream; (void) filename; eliza_set_error(out_error, @@ -3285,6 +3349,7 @@ int eliza_inference_llm_stream_reset(EliLlmStream * stream) { * created/destroyed repeatedly. Handles both the plain fixed-KV stream and * the MTP speculative engine (which owns its own target/draft KV). */ if (!stream) return ELIZA_ERR_INVALID_ARG; + if (stream->backend) return stream->backend->reset(); if (!stream->mtp && !stream->lctx) return ELIZA_ERR_INVALID_ARG; if (stream->mtp) { /* MTP stream: clear both the target and draft KV caches, reset the @@ -3319,6 +3384,7 @@ int eliza_inference_llm_stream_reset_keep(EliLlmStream * stream, int32_t n_keep) * separate (riskier) handling — prefix-reuse mode opens the resident stream * without MTP, trading MTP's ~1.5x decode for the much larger prefill cut. */ if (!stream) return ELIZA_ERR_INVALID_ARG; + if (stream->backend) return stream->backend->reset_keep(n_keep); if (stream->mtp || !stream->lctx) return ELIZA_ERR_INVALID_ARG; if (n_keep < 0) n_keep = 0; if (n_keep > stream->n_past) n_keep = stream->n_past; @@ -3339,6 +3405,10 @@ int eliza_inference_llm_stream_reset_keep(EliLlmStream * stream, int32_t n_keep) void eliza_inference_llm_stream_close(EliLlmStream * stream) { if (!stream) return; + if (stream->backend) { + delete stream->backend; + stream->backend = nullptr; + } if (stream->mtp) { eliza_mtp::free_engine(stream->mtp); stream->mtp = nullptr; diff --git a/tools/omnivoice/src/llm-backend-selector.cpp b/tools/omnivoice/src/llm-backend-selector.cpp new file mode 100644 index 000000000..fa5fa703c --- /dev/null +++ b/tools/omnivoice/src/llm-backend-selector.cpp @@ -0,0 +1,140 @@ +/* + * llm-backend-selector.cpp — registry + selection for the multi-runtime + * streaming-LLM seam (cutover plan M3). + * + * On a default build (no -DELIZA_ENABLE_* gate) NO alternate backend is + * registered, so llm_backend_select() always returns nullptr and the FFI keeps + * the in-tree llama.cpp path. The seam is therefore inert-by-default: the + * library behaves exactly as before until an accelerator backend is compiled in. + */ + +#include "llm-backend.h" + +#include +#include +#include +#include +#include +#include + +/* Gated backend factory accessors. Declared only when the matching backend is + * compiled in; register_builtins() calls them under the same gate. Keeping the + * declarations gated means the default build has no unresolved symbols. */ +#ifdef ELIZA_ENABLE_LITERT +LlmBackendFactory * litert_backend_factory(); +#endif +#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__) +LlmBackendFactory * mlx_coreml_backend_factory(); +#endif + +namespace { + +std::mutex g_reg_mutex; +std::vector g_factories; +std::once_flag g_builtins_once; + +/* Heap-allocate an error string with malloc so the caller can release it with + * eliza_inference_free_string() (which calls free()), matching the FFI contract. */ +char * dup_error(const std::string & msg) { + char * out = (char *) std::malloc(msg.size() + 1); + if (out) std::memcpy(out, msg.c_str(), msg.size() + 1); + return out; +} + +bool iequals(const char * a, const char * b) { + if (!a || !b) return false; + while (*a && *b) { + if (std::tolower((unsigned char) *a) != std::tolower((unsigned char) *b)) { + return false; + } + ++a; + ++b; + } + return *a == *b; +} + +bool is_llamacpp_name(const char * s) { + return iequals(s, "llama.cpp") || iequals(s, "llamacpp") || iequals(s, "llama"); +} + +} // namespace + +void llm_backend_register(LlmBackendFactory * factory) { + if (!factory) return; + std::lock_guard lock(g_reg_mutex); + for (LlmBackendFactory * f : g_factories) { + if (iequals(f->name(), factory->name())) return; /* idempotent by name */ + } + g_factories.push_back(factory); +} + +void llm_backend_register_builtins() { + std::call_once(g_builtins_once, []() { +#ifdef ELIZA_ENABLE_LITERT + llm_backend_register(litert_backend_factory()); +#endif +#if defined(ELIZA_ENABLE_MLX) && defined(__APPLE__) + llm_backend_register(mlx_coreml_backend_factory()); +#endif + }); +} + +LlmBackendFactory * llm_backend_select(const char * bundle_dir, + const eliza_llm_stream_config_t * /*cfg*/, + char ** out_error) { + llm_backend_register_builtins(); + + /* (1) ELIZA_LLM_BACKEND env: a HARD select. */ + const char * forced = std::getenv("ELIZA_LLM_BACKEND"); + if (forced && forced[0] != '\0') { + if (is_llamacpp_name(forced)) { + return nullptr; /* force the in-tree path, not an error */ + } + std::lock_guard lock(g_reg_mutex); + for (LlmBackendFactory * f : g_factories) { + if (!iequals(f->name(), forced)) continue; + if (!f->available()) { + if (out_error) { + *out_error = dup_error( + std::string("[libelizainference] ELIZA_LLM_BACKEND=") + forced + + " is not available in this build/host"); + } + return nullptr; + } + if (!f->can_serve(bundle_dir)) { + if (out_error) { + *out_error = dup_error( + std::string("[libelizainference] ELIZA_LLM_BACKEND=") + forced + + " cannot serve the bundle at " + + (bundle_dir ? bundle_dir : "(null)")); + } + return nullptr; + } + return f; + } + if (out_error) { + *out_error = dup_error( + std::string("[libelizainference] ELIZA_LLM_BACKEND=") + forced + + " is not a registered backend"); + } + return nullptr; + } + + /* (2) Auto-select: the highest preference_rank among available + can_serve. + * The in-tree llama.cpp path is the implicit rank-0 fallback, so an + * accelerator backend only wins when it returns a positive rank AND can + * serve this bundle. */ + std::lock_guard lock(g_reg_mutex); + LlmBackendFactory * best = nullptr; + int best_rank = 0; + for (LlmBackendFactory * f : g_factories) { + if (!f->available()) continue; + if (!f->can_serve(bundle_dir)) continue; + const int rank = f->preference_rank(); + if (rank > best_rank) { + best_rank = rank; + best = f; + } + } + return best; /* nullptr => in-tree llama.cpp */ +} diff --git a/tools/omnivoice/src/llm-backend.h b/tools/omnivoice/src/llm-backend.h new file mode 100644 index 000000000..0fad67f3c --- /dev/null +++ b/tools/omnivoice/src/llm-backend.h @@ -0,0 +1,167 @@ +#pragma once +/* + * llm-backend.h — multi-runtime streaming-LLM backend seam (cutover plan M3). + * + * The libelizainference streaming-LLM FFI (`eliza_inference_llm_stream_*`) is + * ONE pipe that can be driven by more than one in-process inference runtime: + * + * - llama.cpp — the default / reference backend (CPU / CUDA / Vulkan-Mali- + * Adreno / Metal). Always present; the in-tree code path. + * - LiteRT-LM — Android NPU (Tensor / Qualcomm QNN / MediaTek NeuroPilot), + * optionally desktop/iOS GPU. Gated -DELIZA_ENABLE_LITERT. + * - CoreML/MLX — Apple Silicon (mac first, iOS later). Gated -DELIZA_ENABLE_MLX. + * + * Per native/AGENTS.md §11 (reinterpreted by the Gemma-4 cutover): "one managed + * library, one pipe, no sidecar/subprocess/TCP." LiteRT-LM and MLX are + * EMBEDDABLE in-process C++ libraries linked INTO libelizainference and exposed + * behind the SAME FFI streaming symbols — never a child process or TCP server. + * (AICore / Apple Foundation stay opportunistic out-of-process adapters on the + * TS side, not owned backends — they are NOT registered here.) + * + * A backend supplies: + * - LlmBackendSession — the per-generation streaming session, mirroring the + * FFI pull contract (prefill -> next* -> close) 1:1 so + * the FFI functions delegate without translation. + * - LlmBackendFactory — names the runtime, reports availability + bundle fit, + * and opens sessions. + * + * `llm_backend_select()` picks a backend at `_open` time from the platform, the + * bundle contents, the build flags, and the `ELIZA_LLM_BACKEND` env override. + * When it returns nullptr (and no error) the FFI keeps the in-tree llama.cpp + * path — so a build with no alternate backend behaves exactly as before. + */ + +#include "eliza-inference-ffi.h" /* eliza_llm_stream_config_t, EliInferenceContext fwd */ + +#include +#include + +/* Defined in the FFI translation unit. Opaque to backends — a backend reaches + * the resident model/bundle through the accessors below, not the struct. */ +struct EliInferenceContext; + +/* The bundle directory the context was opened against. A backend's open() + * resolves its own artifact under this root (e.g. `/text/*.litertlm`, + * `/text/*.mlpackage`) — the ONLY supported way to read the bundle path, + * since the struct itself is opaque here. Returns nullptr when ctx is null. + * Defined in eliza-inference-ffi.cpp; the pointer is owned by the context and + * stays valid for the session's lifetime. */ +const char * llm_backend_context_bundle_dir(const EliInferenceContext * ctx); + +/* ---- Per-generation streaming session ------------------------------------ * + * + * Lifetime: created by LlmBackendFactory::open(), destroyed via `delete` on the + * FFI `_close` path. Every method mirrors the matching FFI entry point so the + * FFI can `return session->method(...)` with no argument translation. Status + * conventions are identical to the FFI: >= 0 on success, the negative `ELIZA_*` + * constants on failure, with `*out_error` heap-allocated for the caller to free. + */ +struct LlmBackendSession { + virtual ~LlmBackendSession() = default; + + /* Mirrors eliza_inference_llm_stream_prefill. Copies the tokens it needs. */ + virtual int prefill(const int32_t * token_ids, size_t num_tokens, + char ** out_error) = 0; + + /* Mirrors eliza_inference_llm_stream_next. Returns 0 (more output), 1 (final + * step — EOS / cap), or a negative ELIZA_* code (ELIZA_ERR_CANCELLED on + * cancel). `drafter_*_out` carry per-step speculative stats (0 when the + * backend has no drafter). */ + virtual int next(int32_t * tokens_out, size_t tokens_cap, + size_t * num_tokens_out, char * text_out, size_t text_cap, + int32_t * drafter_drafted_out, int32_t * drafter_accepted_out, + char ** out_error) = 0; + + /* Mirrors eliza_inference_llm_stream_cancel. Publishes a flag an in-flight + * next() checks at a step boundary; safe to call from another thread. + * Returns ELIZA_OK whether or not a pass was running. */ + virtual int cancel() = 0; + + /* Mirrors eliza_inference_llm_stream_reset: clear KV + sampler/counters so + * the next prefill starts a fresh prompt on the same warm session. */ + virtual int reset() = 0; + + /* Mirrors eliza_inference_llm_stream_reset_keep: keep the first `n_keep` + * tokens of state resident and drop the rest. Returns the n_keep actually + * applied (>= 0, may be clamped / 0 on a full-reset fallback). A backend + * that cannot do prefix reuse MUST fall back to a full reset and return 0 — + * never an error. */ + virtual int reset_keep(int32_t n_keep) = 0; + + /* Slot KV persistence — optional. Default: not supported. */ + virtual int save_slot(const char * /*filename*/, char ** /*out_error*/) { + return ELIZA_ERR_INVALID_ARG; + } + virtual int restore_slot(const char * /*filename*/, char ** /*out_error*/) { + return ELIZA_ERR_INVALID_ARG; + } +}; + +/* ---- Backend factory (one per linked-in runtime) ------------------------- */ +struct LlmBackendFactory { + virtual ~LlmBackendFactory() = default; + + /* Stable lower-case id: "llama.cpp", "litert-lm", "mlx-coreml". Matched + * case-insensitively against ELIZA_LLM_BACKEND. */ + virtual const char * name() const = 0; + + /* True only when this backend is compiled in AND its runtime dependencies + * are present on THIS host (the NPU delegate / Metal device / the linked + * lib). A scaffold whose build gate is OFF returns false. Cheap — must not + * load a model. */ + virtual bool available() const = 0; + + /* True when this backend can serve the bundle at `bundle_dir` — i.e. the + * backend-specific artifact exists (e.g. `text/*.litertlm`, `text/*.mlpackage`). + * Cheap directory probe, no model load. */ + virtual bool can_serve(const char * bundle_dir) const = 0; + + /* Platform-affinity rank used to order candidates when several can serve the + * same bundle and no env override is set. Higher wins. The in-tree llama.cpp + * path is rank 0 (the implicit fallback); an accelerator backend that is the + * preferred path on this device returns a positive value. */ + virtual int preference_rank() const { return 0; } + + /* Create a streaming session for (ctx, cfg). Returns nullptr + `*out_error` + * on failure. The returned session is owned by the caller (FFI `_close` + * deletes it). */ + virtual LlmBackendSession * open(EliInferenceContext * ctx, + const eliza_llm_stream_config_t * cfg, + char ** out_error) = 0; +}; + +/* ---- Registry + selection ------------------------------------------------ * + * + * Backends register their singleton factory (idempotent; the registry does not + * take ownership — factories are static-lifetime singletons). The FFI + * translation unit calls llm_backend_register_builtins() once to register every + * compiled-in backend, then calls llm_backend_select() per `_open`. + */ + +/* Register a factory (idempotent by name). Safe to call from static init. */ +void llm_backend_register(LlmBackendFactory * factory); + +/* Register every backend compiled into THIS build (gated by the -DELIZA_ENABLE_* + * CMake options). Idempotent; call once at first `_open`. Defined in + * llm-backend-selector.cpp; the gated backends self-register via their headers. */ +void llm_backend_register_builtins(); + +/* Pick a backend for the bundle at `bundle_dir` with `cfg`. Resolution order: + * + * 1. ELIZA_LLM_BACKEND env (exact, case-insensitive backend name) — a HARD + * select. "llama.cpp" / "llamacpp" forces the in-tree path (returns + * nullptr, no error). Any other name that is not registered+available, or + * cannot serve the bundle, is a hard error: returns nullptr AND sets + * `*out_error` so the FFI aborts rather than silently using llama.cpp. + * + * 2. No env override: among registered backends that are available() AND + * can_serve(bundle_dir), pick the highest preference_rank(). If none + * qualifies, return nullptr (use the in-tree llama.cpp path). + * + * A nullptr return with `*out_error == nullptr` means "use the in-tree llama.cpp + * path" — NOT an error. A nullptr return with `*out_error != nullptr` is a hard + * failure the caller must propagate. + */ +LlmBackendFactory * llm_backend_select(const char * bundle_dir, + const eliza_llm_stream_config_t * cfg, + char ** out_error);