Skip to content

Commit 0990be3

Browse files
committed
Enable FlashAttention and clean up P/Invoke signatures
Set FlashAttention to true in BuilderExtensions for improved performance. Refactor the NativeApi P/Invoke method signatures to single-line format for better readability and consistency.
1 parent ff6ea95 commit 0990be3

File tree

2 files changed

+8
-14
lines changed

2 files changed

+8
-14
lines changed

LLama.KernelMemory/BuilderExtensions.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ public static IKernelMemoryBuilder WithLLamaSharpDefaults(this IKernelMemoryBuil
7777
SplitMode = config.SplitMode,
7878
BatchSize = 512,
7979
UBatchSize = 512,
80+
FlashAttention = true,
8081
UseMemorymap = true
8182
};
8283

LLama/Native/NativeApi.cs

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,7 @@ public static void llama_empty_call()
9999
/// <returns></returns>
100100
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
101101
[return: MarshalAs(UnmanagedType.U1)]
102-
public static extern bool llama_state_load_file(SafeLLamaContextHandle ctx, string path_session,
103-
LLamaToken[] tokens_out, ulong n_token_capacity, out ulong n_token_count_out);
102+
public static extern bool llama_state_load_file(SafeLLamaContextHandle ctx, string path_session, LLamaToken[] tokens_out, ulong n_token_capacity, out ulong n_token_count_out);
104103

105104
/// <summary>
106105
/// Save session file
@@ -112,45 +111,39 @@ public static extern bool llama_state_load_file(SafeLLamaContextHandle ctx, stri
112111
/// <returns></returns>
113112
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
114113
[return: MarshalAs(UnmanagedType.U1)]
115-
public static extern bool llama_state_save_file(SafeLLamaContextHandle ctx, string path_session,
116-
LLamaToken[] tokens, ulong n_token_count);
114+
public static extern bool llama_state_save_file(SafeLLamaContextHandle ctx, string path_session, LLamaToken[] tokens, ulong n_token_count);
117115

118116
/// <summary>
119117
/// Saves the specified sequence as a file on specified filepath. Can later be loaded via <see cref="llama_state_load_file(SafeLLamaContextHandle, string, LLamaToken[], ulong, out ulong)"/>
120118
/// </summary>
121119
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
122-
public static extern unsafe nuint llama_state_seq_save_file(SafeLLamaContextHandle ctx, string filepath,
123-
LLamaSeqId seq_id, LLamaToken* tokens, nuint n_token_count);
120+
public static extern unsafe nuint llama_state_seq_save_file(SafeLLamaContextHandle ctx, string filepath, LLamaSeqId seq_id, LLamaToken* tokens, nuint n_token_count);
124121

125122
/// <summary>
126123
/// Loads a sequence saved as a file via <see cref="llama_state_save_file(SafeLLamaContextHandle, string, LLamaToken[], ulong)"/> into the specified sequence
127124
/// </summary>
128125
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
129-
public static extern unsafe nuint llama_state_seq_load_file(SafeLLamaContextHandle ctx, string filepath,
130-
LLamaSeqId dest_seq_id, LLamaToken* tokens_out, nuint n_token_capacity, out nuint n_token_count_out);
126+
public static extern unsafe nuint llama_state_seq_load_file(SafeLLamaContextHandle ctx, string filepath, LLamaSeqId dest_seq_id, LLamaToken* tokens_out, nuint n_token_capacity, out nuint n_token_count_out);
131127

132128
/// <summary>
133129
/// Set whether to use causal attention or not. If set to true, the model will only attend to the past tokens
134130
/// </summary>
135131
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
136-
public static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx,
137-
[MarshalAs(UnmanagedType.U1)] bool causalAttn);
132+
public static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool causalAttn);
138133

139134
/// <summary>
140135
/// Set whether the context outputs embeddings or not
141136
/// </summary>
142137
/// <param name="ctx"></param>
143138
/// <param name="embeddings">If true, embeddings will be returned but logits will not</param>
144139
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
145-
public static extern void llama_set_embeddings(SafeLLamaContextHandle ctx,
146-
[MarshalAs(UnmanagedType.U1)] bool embeddings);
140+
public static extern void llama_set_embeddings(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool embeddings);
147141

148142
/// <summary>
149143
/// Set abort callback
150144
/// </summary>
151145
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
152-
public static extern void llama_set_abort_callback(SafeLlamaModelHandle ctx,
153-
IntPtr /* ggml_abort_callback */ abortCallback, IntPtr abortCallbackData);
146+
public static extern void llama_set_abort_callback(SafeLlamaModelHandle ctx, IntPtr /* ggml_abort_callback */ abortCallback, IntPtr abortCallbackData);
154147

155148
/// <summary>
156149
/// Get the n_seq_max for this context

0 commit comments

Comments (0)