From 57fbe53ec564e8701b684148ae54442042a23303 Mon Sep 17 00:00:00 2001 From: abetlen Date: Wed, 25 Mar 2026 14:09:34 -0700 Subject: [PATCH 1/2] Fix embedding models without KV memory --- llama_cpp/_internals.py | 4 +++- tests/test_llama.py | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index 6862135aa..9e9bcd407 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -288,7 +288,9 @@ def pooling_type(self) -> int: return llama_cpp.llama_pooling_type(self.ctx) def kv_cache_clear(self): - assert self.memory is not None, "Memory is not initialized" + # Embedding models with non-causal attention may not allocate memory. + if self.memory is None: + return llama_cpp.llama_memory_clear(self.memory, True) def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int) -> bool: diff --git a/tests/test_llama.py b/tests/test_llama.py index 1a70c74d4..23928fff6 100644 --- a/tests/test_llama.py +++ b/tests/test_llama.py @@ -64,6 +64,14 @@ def llama_cpp_model_path(): return model_path +@pytest.fixture +def llama_cpp_embedding_model_path(): + repo_id = "CompendiumLabs/bge-small-en-v1.5-gguf" + filename = "bge-small-en-v1.5-q4_k_m.gguf" + model_path = hf_hub_download(repo_id, filename) + return model_path + + def test_real_model(llama_cpp_model_path): import os @@ -225,9 +233,9 @@ def logit_processor_func(input_ids, logits): assert number_1 == number_3 -def test_real_llama_embeddings(llama_cpp_model_path): +def test_real_llama_embeddings(llama_cpp_embedding_model_path): model = llama_cpp.Llama( - llama_cpp_model_path, + llama_cpp_embedding_model_path, n_ctx=32, n_batch=32, n_ubatch=32, @@ -237,5 +245,5 @@ def test_real_llama_embeddings(llama_cpp_model_path): flash_attn=True, embedding=True, ) - # Smoke test for now - model.embed("Hello World") + embedding = model.embed("Hello World") + assert len(embedding) > 0 From 96550fb8f37f6efe6789321b8320053eea5a14ad Mon Sep 17 00:00:00 2001 From: abetlen Date: Wed, 25 Mar 2026 15:03:51 -0700 Subject: [PATCH 2/2] Add changelog entry for embedding memory fix --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f4a0b55d3..d2e4937c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- fix: Handle embedding models without KV memory and test embeddings with a real GGUF embedding model by @abetlen in #2160 - fix(ci): Shrink CUDA wheel fatbins so CUDA releases stay under GitHub's asset size limit by @abetlen in #2158 ## [0.3.18]