From 09ffcb80d649d1ecd7289ed9aeddcb185130b4cc Mon Sep 17 00:00:00 2001 From: Victor Biederbeck Date: Sat, 14 Mar 2026 11:17:46 -0700 Subject: [PATCH 1/3] feat: expose attention_type parameter in Llama.__init__ --- llama_cpp/llama.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 88bc2e5bb..6f22b94fa 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -81,6 +81,7 @@ def __init__( int ] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED, + attention_type: int = llama_cpp.LLAMA_ATTENTION_TYPE_UNSPECIFIED, rope_freq_base: float = 0.0, rope_freq_scale: float = 0.0, yarn_ext_factor: float = -1.0, @@ -319,6 +320,7 @@ def __init__( else llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED ) self.context_params.pooling_type = pooling_type + self.context_params.attention_type = attention_type self.context_params.rope_freq_base = ( rope_freq_base if rope_freq_base != 0.0 else 0 ) From 5478014e80a28c8ef7af91528e6cba14af08012b Mon Sep 17 00:00:00 2001 From: abetlen Date: Tue, 24 Mar 2026 02:44:25 -0700 Subject: [PATCH 2/3] fix: preserve attention_type in pickled state --- llama_cpp/llama.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6f22b94fa..6a81fe999 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -164,6 +164,7 @@ def __init__( n_threads_batch: Number of threads to use for batch processing rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. ref: https://github.com/ggerganov/llama.cpp/pull/2054 pooling_type: Pooling type, from `enum llama_pooling_type`. + attention_type: Attention type, from `enum llama_attention_type`. 
rope_freq_base: RoPE base frequency, 0 = from model rope_freq_scale: RoPE frequency scaling factor, 0 = from model yarn_ext_factor: YaRN extrapolation mix factor, negative = from model @@ -2102,6 +2103,7 @@ def __getstate__(self): n_threads_batch=self.context_params.n_threads_batch, rope_scaling_type=self.context_params.rope_scaling_type, pooling_type=self.context_params.pooling_type, + attention_type=self.context_params.attention_type, rope_freq_base=self.context_params.rope_freq_base, rope_freq_scale=self.context_params.rope_freq_scale, yarn_ext_factor=self.context_params.yarn_ext_factor, @@ -2141,6 +2143,9 @@ def __getstate__(self): ) def __setstate__(self, state): + state.setdefault( + "attention_type", llama_cpp.LLAMA_ATTENTION_TYPE_UNSPECIFIED + ) self.__init__(**state) def save_state(self) -> LlamaState: From 2da411b47ab3a77dda9c2f3a9b6ab4fe29c1e5ce Mon Sep 17 00:00:00 2001 From: abetlen Date: Tue, 24 Mar 2026 02:46:39 -0700 Subject: [PATCH 3/3] chore: update changelog for attention_type and drop redundant __setstate__ default --- CHANGELOG.md | 1 + llama_cpp/llama.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b47613109..de4f070ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: Expose `attention_type` in `Llama.__init__` for non-causal embedding models by @jamesbiederbeck in #2143 - fix(ci): Build Docker images from the checked-out source and sanitize branch tags by @abetlen in #2156 - fix(ci): Fix the CUDA wheel workflow and keep release tags aligned with the built toolkit by @abetlen in #2155 - fix(ci): Speed up release wheel builds by moving arm64 off QEMU and parallelizing riscv64 by @abetlen in #2154 diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6a81fe999..ad484c4d5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -2143,9 +2143,6 @@ def __getstate__(self): ) def __setstate__(self, state): - 
state.setdefault( - "attention_type", llama_cpp.LLAMA_ATTENTION_TYPE_UNSPECIFIED - ) self.__init__(**state) def save_state(self) -> LlamaState: