
Commit ff6ea95

Fix Reranker and Sampling Test Failures
1 parent: 424a736

6 files changed (+18, -2 lines)

LLama.Unittest/LLamaRerankerTests.cs

Lines changed: 1 addition & 1 deletion
@@ -18,9 +18,9 @@ public LLamaRerankerTests(ITestOutputHelper testOutputHelper)
         var @params = new ModelParams(Constants.RerankingModelPath)
         {
             ContextSize = 0,
+            SeqMax = 1,
             PoolingType = LLamaPoolingType.Rank,
             GpuLayerCount = Constants.CIGpuLayerCount,
-
         };
         using var weights = LLamaWeights.LoadFromFile(@params);
         _reranker = new LLamaReranker(weights, @params);
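
For reference, here is the fixed reranker setup as a standalone sketch. The identifiers are exactly those in the test; the explanatory comments are an editor's reading of the settings, not text from the commit:

    // Rank pooling yields one relevance score per input, so the reranking
    // context only ever needs a single sequence slot (SeqMax = 1).
    var @params = new ModelParams(Constants.RerankingModelPath)
    {
        ContextSize = 0,                      // 0 = use the model's own context length (llama.cpp convention)
        SeqMax = 1,                           // maximum number of parallel sequences in this context
        PoolingType = LLamaPoolingType.Rank,  // pooled output used for relevance ranking
        GpuLayerCount = Constants.CIGpuLayerCount,
    };
    using var weights = LLamaWeights.LoadFromFile(@params);
    var reranker = new LLamaReranker(weights, @params);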

LLama.Unittest/SamplingTests.cs

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ public SamplingTests(ITestOutputHelper testOutputHelper)
         _params = new ModelParams(Constants.GenerativeModelPath2) {
             ContextSize = 200,
             BatchSize = 200,
+            SeqMax = 4,
             GpuLayerCount = Constants.CIGpuLayerCount,
         };
         _model = LLamaWeights.LoadFromFile(_params);
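
The sampling fix is the same idea from the other direction: these tests decode several sequences against one context, so the context has to be created with enough sequence slots. A sketch of the fixed configuration (that the tests batch up to four sequences is inferred from SeqMax = 4, not stated in the commit):

    // Every sequence id submitted in a batch must fall below SeqMax;
    // with SeqMax = 4 a batch can interleave up to four sequences.
    var @params = new ModelParams(Constants.GenerativeModelPath2)
    {
        ContextSize = 200,
        BatchSize = 200,  // maximum tokens submitted per decode call
        SeqMax = 4,       // maximum number of distinct sequences per context
        GpuLayerCount = Constants.CIGpuLayerCount,
    };
    using var model = LLamaWeights.LoadFromFile(@params);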

LLama.Web/Common/ModelOptions.cs

Lines changed: 1 addition & 1 deletion
@@ -102,7 +102,7 @@ public class ModelOptions
     public bool NoKqvOffload { get; set; }
 
     /// <inheritdoc />
-    public bool FlashAttention { get; set; }
+    public bool? FlashAttention { get; set; }
 
     /// <inheritdoc />
     public Encoding Encoding { get; set; } = Encoding.UTF8;

LLama/Abstractions/IContextParams.cs

Lines changed: 5 additions & 0 deletions
@@ -103,6 +103,11 @@ public interface IContextParams
     /// </summary>
     bool NoKqvOffload { get; }
 
+    /// <summary>
+    /// Whether to use flash attention
+    /// </summary>
+    bool? FlashAttention { get; }
+
     /// <summary>
     /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt;= 0 to disable (default)
     /// </summary>

LLama/Common/ModelParams.cs

Lines changed: 3 additions & 0 deletions
@@ -96,6 +96,9 @@ public record ModelParams
 
     /// <inheritdoc />
     public bool NoKqvOffload { get; set; }
+
+    /// <inheritdoc />
+    public bool? FlashAttention { get; set; }
 
     /// <inheritdoc />
     [Obsolete]

LLama/Extensions/IContextParamsExtensions.cs

Lines changed: 7 additions & 0 deletions
@@ -51,6 +51,13 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
         result.offload_kqv = !@params.NoKqvOffload;
         result.llama_pooling_type = @params.PoolingType;
         result.attention_type = @params.AttentionType;
+        result.llama_flash_attn_type = @params.FlashAttention switch
+        {
+            true => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_ENABLED,
+            false => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_DISABLED,
+            null => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_AUTO
+        };
+        result.kv_unified = true;
 
         result.n_threads = Threads(@params.Threads);
         result.n_threads_batch = Threads(@params.BatchThreads);
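
The nullable flag gives callers a tri-state switch over llama.cpp's flash-attention mode, with null deferring the choice to the backend. A minimal usage sketch (the model path is a placeholder):

    var @params = new ModelParams("<path to model.gguf>")
    {
        // true  maps to LLAMA_FLASH_ATTENTION_TYPE_ENABLED
        // false maps to LLAMA_FLASH_ATTENTION_TYPE_DISABLED
        FlashAttention = null,  // null maps to LLAMA_FLASH_ATTENTION_TYPE_AUTO: let llama.cpp decide
    };

The accompanying result.kv_unified = true opts every context into llama.cpp's unified KV-cache layout (one cache buffer shared across all sequences).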
