From 09ffcb80d649d1ecd7289ed9aeddcb185130b4cc Mon Sep 17 00:00:00 2001 From: Victor Biederbeck Date: Sat, 14 Mar 2026 11:17:46 -0700 Subject: [PATCH 1/3] feat: expose attention_type parameter in Llama.__init__ --- llama_cpp/llama.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 88bc2e5bb..6f22b94fa 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -81,6 +81,7 @@ def __init__( int ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED, + attention_type: int = llama_cpp.LLAMA_ATTENTION_TYPE_UNSPECIFIED, rope_freq_base: float = 0.0, rope_freq_scale: float = 0.0, yarn_ext_factor: float = -1.0, @@ -319,6 +320,7 @@ def __init__( else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED ) self.context_params.pooling_type = pooling_type + self.context_params.attention_type = attention_type self.context_params.rope_freq_base = ( rope_freq_base if rope_freq_base != 0.0 else 0 ) From 5478014e80a28c8ef7af91528e6cba14af08012b Mon Sep 17 00:00:00 2001 From: abetlen Date: Tue, 24 Mar 2026 02:44:25 -0700 Subject: [PATCH 2/3] fix: preserve attention_type in pickled state --- llama_cpp/llama.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6f22b94fa..6a81fe999 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -164,6 +164,7 @@ def __init__( n_threads_batch: Number of threads to use for batch processing rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054 pooling_type: Pooling type, from `enum llama_pooling_type`. + attention_type: Attention type, from `enum llama_attention_type`. 
rope_freq_base: RoPE base frequency, 0 = from model rope_freq_scale: RoPE frequency scaling factor, 0 = from model yarn_ext_factor: YaRN extrapolation mix factor, negative = from model @@ -2102,6 +2103,7 @@ def __getstate__(self): n_threads_batch=self.context_params.n_threads_batch, rope_scaling_type=self.context_params.rope_scaling_type, pooling_type=self.context_params.pooling_type, + attention_type=self.context_params.attention_type, rope_freq_base=self.context_params.rope_freq_base, rope_freq_scale=self.context_params.rope_freq_scale, yarn_ext_factor=self.context_params.yarn_ext_factor, @@ -2141,6 +2143,9 @@ def __getstate__(self): ) def __setstate__(self, state): + state.setdefault( + "attention_type", llama_cpp.LLAMA_ATTENTION_TYPE_UNSPECIFIED + ) self.__init__(**state) def save_state(self) -> LlamaState: From 2da411b47ab3a77dda9c2f3a9b6ab4fe29c1e5ce Mon Sep 17 00:00:00 2001 From: abetlen Date: Tue, 24 Mar 2026 02:46:39 -0700 Subject: [PATCH 3/3] chore: update changelog for attention_type and drop redundant __setstate__ default --- CHANGELOG.md | 1 + llama_cpp/llama.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b47613109..de4f070ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143 - fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156 - fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155 - fix(ci): Speed up release wheel builds by moving arm64 off QEMU and parallelizing riscv64 by @abetlen in #2154 diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6a81fe999..ad484c4d5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2143,9 +2143,6 @@ def __getstate__(self): ) def __setstate__(self, state): - 
state.setdefault( - "attention_type", llama_cpp.LLAMA_ATTENTION_TYPE_UNSPECIFIED - ) self.__init__(**state) def save_state(self) -> LlamaState: