
Commit 4355809

Enable FlashAttention and remove SeqMax param
FlashAttention is now enabled by default in the model parameter initialization of the KernelMemory embedding and text-generation wrappers. The unused SeqMax parameter has been removed from the unit tests to simplify configuration, and minor formatting improvements were made in IContextParamsExtensions and NativeApi for consistency.
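
From the caller's side nothing changes: the flag is set inside the KernelMemory wrappers themselves. A minimal sketch of the effect (hedged; the LLamaSharpConfig constructor shape is assumed here, not shown in this commit):

    // Construct the embedding generator as before; after this commit the
    // ModelParams it builds internally include FlashAttention = true.
    var config = new LLamaSharpConfig("path/to/model.gguf");
    var embedder = new LLamaSharpTextEmbeddingGenerator(config);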
1 parent 0990be3 commit 4355809


7 files changed: +8 -10 lines changed


LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs

Lines changed: 2 additions & 0 deletions
@@ -40,6 +40,7 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)
     SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
     BatchSize = 512,
     UBatchSize = 512,
+    FlashAttention = true,
     UseMemorymap = true,
     PoolingType = LLamaPoolingType.Mean,
 };
@@ -67,6 +68,7 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config, LLamaWeights we
     SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
     BatchSize = 512,
     UBatchSize = 512,
+    FlashAttention = true,
     UseMemorymap = true,
     PoolingType = LLamaPoolingType.Mean,
 };

LLama.KernelMemory/LlamaSharpTextGenerator.cs

Lines changed: 2 additions & 0 deletions
@@ -38,6 +38,7 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config)
     SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
     BatchSize = 512,
     UBatchSize = 512,
+    FlashAttention = true,
     UseMemorymap = true
 };
 _weights = LLamaWeights.LoadFromFile(@params);
@@ -65,6 +66,7 @@ public LlamaSharpTextGenerator(LLamaWeights weights, LLamaSharpConfig config, St
     SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
     BatchSize = 512,
     UBatchSize = 512,
+    FlashAttention = true,
     UseMemorymap = true
 };
 _executor = executor ?? new StatelessExecutor(_weights, @params);
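
Both wrappers hard-code FlashAttention = true rather than exposing it on LLamaSharpConfig, so callers on backends without flash attention support would need to build the executor directly. A short sketch using only types that appear in this diff (the FlashAttention property name is taken from the commit itself):

    // Bypass the KernelMemory wrapper to control the flag explicitly.
    var @params = new ModelParams("path/to/model.gguf")
    {
        BatchSize = 512,
        UBatchSize = 512,
        FlashAttention = false, // opt out where flash attention is unsupported
        UseMemorymap = true
    };
    using var weights = LLamaWeights.LoadFromFile(@params);
    var executor = new StatelessExecutor(weights, @params);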

LLama.Unittest/LLamaContextTests.cs

Lines changed: 0 additions & 1 deletion
@@ -16,7 +16,6 @@ public LLamaContextTests()
     ContextSize = 512,
     BatchSize = 8,
     UBatchSize = 8,
-    SeqMax = 1,
     VocabOnly = false,
     GpuLayerCount = Constants.CIGpuLayerCount,
 };

LLama.Unittest/LLamaRerankerTests.cs

Lines changed: 0 additions & 1 deletion
@@ -18,7 +18,6 @@ public LLamaRerankerTests(ITestOutputHelper testOutputHelper)
 var @params = new ModelParams(Constants.RerankingModelPath)
 {
     ContextSize = 0,
-    SeqMax = 1,
     PoolingType = LLamaPoolingType.Rank,
     GpuLayerCount = Constants.CIGpuLayerCount,
 };

LLama.Unittest/SamplingTests.cs

Lines changed: 0 additions & 1 deletion
@@ -25,7 +25,6 @@ public SamplingTests(ITestOutputHelper testOutputHelper)
 _params = new ModelParams(Constants.GenerativeModelPath2) {
     ContextSize = 200,
     BatchSize = 200,
-    SeqMax = 4,
     GpuLayerCount = Constants.CIGpuLayerCount,
 };
 _model = LLamaWeights.LoadFromFile(_params);
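
SeqMax itself remains available on ModelParams; the tests now simply fall back to the library default (n_seq_max defaults to 1 in upstream llama.cpp). A short sketch of the explicit opt-in for callers that do decode several sequences in parallel:

    // Only needed when decoding multiple sequences in one context.
    var multiSeq = new ModelParams(Constants.GenerativeModelPath2)
    {
        ContextSize = 200,
        SeqMax = 4, // explicit, now that the tests no longer set it
    };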

LLama/Extensions/IContextParamsExtensions.cs

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
     result.yarn_beta_slow = @params.YarnBetaSlow ?? 1f;
     result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
     result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.Unspecified;
-
+
     result.defrag_threshold = @params.DefragThreshold ?? -1;
 
     result.cb_eval = IntPtr.Zero;

LLama/Native/NativeApi.cs

Lines changed: 3 additions & 6 deletions
@@ -175,15 +175,12 @@ public static void llama_empty_call()
 /// <param name="buf">A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)</param>
 /// <param name="length">The size of the allocated buffer</param>
 /// <returns>The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.</returns>
-public static unsafe int llama_chat_apply_template(byte* tmpl, LLamaChatMessage* chat, nuint n_msg,
-    [MarshalAs(UnmanagedType.U1)] bool add_ass, byte* buf, int length)
+public static unsafe int llama_chat_apply_template(byte* tmpl, LLamaChatMessage* chat, nuint n_msg, [MarshalAs(UnmanagedType.U1)] bool add_ass, byte* buf, int length)
 {
     return internal_llama_chat_apply_template(tmpl, chat, n_msg, add_ass, buf, length);
 
-    [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl,
-        EntryPoint = "llama_chat_apply_template")]
-    static extern int internal_llama_chat_apply_template(byte* tmpl, LLamaChatMessage* chat, nuint n_msg,
-        [MarshalAs(UnmanagedType.U1)] bool add_ass, byte* buf, int length);
+    [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl,EntryPoint = "llama_chat_apply_template")]
+    static extern int internal_llama_chat_apply_template(byte* tmpl, LLamaChatMessage* chat, nuint n_msg, [MarshalAs(UnmanagedType.U1)] bool add_ass, byte* buf, int length);
 }
 
 /// <summary>
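
The returns comment above implies the usual size-probe pattern for this API: apply the template into a best-guess buffer, and if the return value exceeds the buffer length, re-allocate and apply again. A hedged sketch of that pattern from an unsafe caller's side (tmpl, chat, n_msg and totalChars are assumed to be prepared elsewhere):

    // First attempt with the recommended starting size:
    // 2 * (total number of characters of all messages).
    var buf = new byte[2 * totalChars];
    int written;
    fixed (byte* bufPtr = buf)
        written = NativeApi.llama_chat_apply_template(tmpl, chat, n_msg, true, bufPtr, buf.Length);

    if (written > buf.Length)
    {
        // Buffer was too small: re-alloc to the reported size and re-apply.
        buf = new byte[written];
        fixed (byte* bufPtr = buf)
            written = NativeApi.llama_chat_apply_template(tmpl, chat, n_msg, true, bufPtr, buf.Length);
    }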
