diff --git a/tabby/Models/LlamaRuntimeModels.swift b/tabby/Models/LlamaRuntimeModels.swift
index 3553d6e..0272f00 100644
--- a/tabby/Models/LlamaRuntimeModels.swift
+++ b/tabby/Models/LlamaRuntimeModels.swift
@@ -188,6 +188,12 @@ enum LlamaRuntimeError: LocalizedError {
     case unavailable(String)
     case cancelled
     case generationFailed(String)
+    /// Signals that the first-token confidence gate aborted generation because the model's top-1
+    /// raw-logit softmax probability at position 0 was below the configured threshold.
+    /// The engine layer treats this as a normal "no suggestion" outcome rather than a user-facing
+    /// failure. Carrying probability/threshold/token in the case lets diagnostics distinguish a
+    /// suppressed-by-confidence empty from any other empty result.
+    case lowConfidenceSuppression(probability: Float, threshold: Double, token: String)
 
     var errorDescription: String? {
         switch self {
@@ -195,6 +201,8 @@ enum LlamaRuntimeError: LocalizedError {
             return message
         case .cancelled:
             return "Runtime work was cancelled."
+        case let .lowConfidenceSuppression(probability, threshold, _):
+            return "Suggestion suppressed: first-token confidence \(probability) is below threshold \(threshold)."
         }
     }
 }
diff --git a/tabby/Models/SuggestionEngineModels.swift b/tabby/Models/SuggestionEngineModels.swift
index f4c84e2..df5f322 100644
--- a/tabby/Models/SuggestionEngineModels.swift
+++ b/tabby/Models/SuggestionEngineModels.swift
@@ -83,4 +83,13 @@ struct SuggestionSettingsSnapshot: Equatable, Sendable {
     /// on the first generated token, preventing conversational openers from appearing in
     /// inline autocomplete suggestions.
     let isFirstTokenGatingEnabled: Bool
+    /// When true, the llama runtime measures top-1 probability of the raw-logit softmax at
+    /// position 0 and silently suppresses the suggestion if it falls below
+    /// `firstTokenConfidenceThreshold`. Distinct from gating: gating *masks* specific tokens,
+    /// confidence suppression *aborts* the whole suggestion when the model is uncertain.
+    let isFirstTokenConfidenceGatingEnabled: Bool
+    /// Probability threshold in [0, 1]. The suggestion is suppressed when the model's top-1
+    /// raw-logit softmax probability at position 0 is below this value. 0 disables in practice
+    /// (any probability >= 0 passes).
+    let firstTokenConfidenceThreshold: Double
 }
diff --git a/tabby/Models/SuggestionModels.swift b/tabby/Models/SuggestionModels.swift
index 5e5b508..74083e7 100644
--- a/tabby/Models/SuggestionModels.swift
+++ b/tabby/Models/SuggestionModels.swift
@@ -257,6 +257,12 @@ struct SuggestionRequest: Equatable, Sendable {
     /// on the very first sampled token. This is a llama-only feature — Apple Intelligence does
     /// not expose logit-level control.
     let isFirstTokenGatingEnabled: Bool
+    /// When true, the llama runtime measures the top-1 raw-logit softmax probability at the
+    /// first token position and aborts (returns no suggestion) when it falls below
+    /// `firstTokenConfidenceThreshold`. llama-only.
+    let isFirstTokenConfidenceGatingEnabled: Bool
+    /// Probability threshold in [0, 1] used by `isFirstTokenConfidenceGatingEnabled`.
+    let firstTokenConfidenceThreshold: Double
 }
 
 /// The engine's normalized response, including raw model text for debugging.
diff --git a/tabby/Models/SuggestionSettingsModel.swift b/tabby/Models/SuggestionSettingsModel.swift
index dfde988..e83a708 100644
--- a/tabby/Models/SuggestionSettingsModel.swift
+++ b/tabby/Models/SuggestionSettingsModel.swift
@@ -24,6 +24,15 @@ final class SuggestionSettingsModel: ObservableObject {
     /// This prevents instruction-tuned models from starting suggestions with conversational
     /// openers that belong in a chat reply, not in inline autocomplete.
     @Published private(set) var isFirstTokenGatingEnabled: Bool
+    /// When enabled, the llama runtime measures the top-1 raw-logit softmax probability of the
+    /// first sampled token and silently suppresses the whole suggestion if it falls below
+    /// `firstTokenConfidenceThreshold`. This is a *separate* axis from chat-opener gating:
+    /// gating masks specific tokens; confidence suppression aborts generation entirely when
+    /// the model's own distribution is too flat to produce a trustworthy continuation.
+    @Published private(set) var isFirstTokenConfidenceGatingEnabled: Bool
+    /// Probability threshold in [0, 1]. Higher values are stricter (more suggestions are
+    /// suppressed). 0 effectively disables the gate even when the toggle is on.
+    @Published private(set) var firstTokenConfidenceThreshold: Double
 
     private let userDefaults: UserDefaults
 
@@ -38,6 +47,14 @@ final class SuggestionSettingsModel: ObservableObject {
     private static let selectedLocalPromptModeDefaultsKey = "selectedLocalSuggestionPromptMode"
     private static let customAIInstructionsDefaultsKey = "tabbyCustomAIInstructions"
     private static let isFirstTokenGatingEnabledDefaultsKey = "tabbyFirstTokenGatingEnabled"
+    private static let confidenceGatingEnabledDefaultsKey = "tabbyFirstTokenConfidenceGatingEnabled"
+    private static let confidenceThresholdDefaultsKey = "tabbyFirstTokenConfidenceThreshold"
+
+    /// 0.10 is a deliberately gentle starting point: our local models often peak at ~0.30-0.60
+    /// for unambiguous continuations, so this threshold catches only the genuinely-confused
+    /// cases (e.g. the model sees the prompt as ambiguous and spreads probability widely).
+    /// We expect to tune this once telemetry from the `first-token-confidence` log accumulates.
+    private static let defaultFirstTokenConfidenceThreshold: Double = 0.10
 
     init(
         configuration: SuggestionConfiguration,
@@ -74,6 +91,18 @@ final class SuggestionSettingsModel: ObservableObject {
         }
         // Default to enabled — first-token gating is a net positive for all known instruct models.
         let resolvedFirstTokenGatingEnabled = userDefaults.object(forKey: Self.isFirstTokenGatingEnabledDefaultsKey) as? Bool ?? true
+        // Default off until we've seen field telemetry. The deny-list gate ships on by default
+        // because it's evidence-backed and surgical; confidence suppression is heuristic and can
+        // hide useful suggestions when the threshold is mistuned, so users opt in explicitly.
+        let resolvedConfidenceGatingEnabled = userDefaults
+            .object(forKey: Self.confidenceGatingEnabledDefaultsKey) as? Bool ?? false
+        let resolvedConfidenceThreshold: Double = {
+            guard userDefaults.object(forKey: Self.confidenceThresholdDefaultsKey) != nil else {
+                return Self.defaultFirstTokenConfidenceThreshold
+            }
+            let raw = userDefaults.double(forKey: Self.confidenceThresholdDefaultsKey)
+            return min(max(raw, 0.0), 1.0)
+        }()
 
         isGloballyEnabled = resolvedGloballyEnabled
         disabledAppRules = resolvedDisabledAppRules
@@ -84,6 +113,8 @@ final class SuggestionSettingsModel: ObservableObject {
         selectedLocalPromptMode = resolvedLocalPromptMode
         customAIInstructions = resolvedCustomAIInstructions
         isFirstTokenGatingEnabled = resolvedFirstTokenGatingEnabled
+        isFirstTokenConfidenceGatingEnabled = resolvedConfidenceGatingEnabled
+        firstTokenConfidenceThreshold = resolvedConfidenceThreshold
 
         userDefaults.set(resolvedGloballyEnabled, forKey: Self.isGloballyEnabledDefaultsKey)
         persistDisabledAppRules(resolvedDisabledAppRules)
@@ -94,6 +125,8 @@ final class SuggestionSettingsModel: ObservableObject {
         persistSelectedLocalPromptMode(resolvedLocalPromptMode)
         persistCustomAIInstructions(resolvedCustomAIInstructions)
         userDefaults.set(resolvedFirstTokenGatingEnabled, forKey: Self.isFirstTokenGatingEnabledDefaultsKey)
+        userDefaults.set(resolvedConfidenceGatingEnabled, forKey: Self.confidenceGatingEnabledDefaultsKey)
+        userDefaults.set(resolvedConfidenceThreshold, forKey: Self.confidenceThresholdDefaultsKey)
     }
 
     /// Compatibility shim for legacy call sites while the UI migrates from the old toggle to the
@@ -121,7 +154,9 @@ final class SuggestionSettingsModel: ObservableObject {
             selectedWordCountPreset: selectedWordCountPreset,
             effectivePromptMode: effectivePromptMode,
             customAIInstructions: CustomAIInstructionFormatter.normalized(customAIInstructions),
-            isFirstTokenGatingEnabled: isFirstTokenGatingEnabled
+            isFirstTokenGatingEnabled: isFirstTokenGatingEnabled,
+            isFirstTokenConfidenceGatingEnabled: isFirstTokenConfidenceGatingEnabled,
+            firstTokenConfidenceThreshold: firstTokenConfidenceThreshold
         )
     }
 
@@ -282,6 +317,27 @@ final class SuggestionSettingsModel: ObservableObject {
         userDefaults.set(enabled, forKey: Self.isFirstTokenGatingEnabledDefaultsKey)
     }
 
+    /// Turns first-token confidence suppression on or off and persists the choice.
+    func setFirstTokenConfidenceGatingEnabled(_ enabled: Bool) {
+        // Skip redundant writes so an unchanged value is neither re-published nor re-persisted.
+        if isFirstTokenConfidenceGatingEnabled == enabled { return }
+
+        isFirstTokenConfidenceGatingEnabled = enabled
+        userDefaults.set(enabled, forKey: Self.confidenceGatingEnabledDefaultsKey)
+    }
+
+    /// Persists the threshold clamped to [0, 1]; the runtime layer trusts it as already-valid.
+    func setFirstTokenConfidenceThreshold(_ threshold: Double) {
+        // Swift's min/max pass NaN through, so reject it explicitly: a NaN threshold would never
+        // suppress anything and, being unequal to itself, would defeat the change guard below.
+        guard !threshold.isNaN else { return }
+        let clamped = min(max(threshold, 0.0), 1.0)
+        guard firstTokenConfidenceThreshold != clamped else { return }
+
+        firstTokenConfidenceThreshold = clamped
+        userDefaults.set(clamped, forKey: Self.confidenceThresholdDefaultsKey)
+    }
+
     private static func effectivePromptMode(
         engine: SuggestionEngineKind,
         localPromptMode: SuggestionPromptMode
@@ -423,7 +479,12 @@ final class SuggestionSettingsModel: ObservableObject {
 
 extension SuggestionSettingsModel: SuggestionSettingsProviding {
     var snapshotPublisher: AnyPublisher<SuggestionSettingsSnapshot, Never> {
-        Publishers.CombineLatest4(
+        // Combine maxes out at four upstreams per operator, but the snapshot now depends on nine
+        // published values. We split them into two bundles — a "core" group (which also carries
+        // the chat-opener gating toggle) and a "confidence" group with the two confidence-gating
+        // settings — then CombineLatest those two intermediate publishers. removeDuplicates on
+        // the mapped snapshot makes the downstream still emit only on real change.
+        let coreSelections = Publishers.CombineLatest4(
             Publishers.CombineLatest4(
                 $isGloballyEnabled,
                 $disabledAppRules,
@@ -434,22 +495,33 @@ extension SuggestionSettingsModel: SuggestionSettingsProviding {
             $customAIInstructions,
             $isFirstTokenGatingEnabled
         )
-        .map { combinedSettings, localPromptMode, customAIInstructions, firstTokenGatingEnabled in
-            let (globallyEnabled, disabledAppRules, engine, wordCountPreset) = combinedSettings
-            return SuggestionSettingsSnapshot(
-                isGloballyEnabled: globallyEnabled,
-                disabledAppBundleIdentifiers: Set(disabledAppRules.map(\.bundleIdentifier)),
-                selectedEngine: engine,
-                selectedWordCountPreset: wordCountPreset,
-                effectivePromptMode: Self.effectivePromptMode(
-                    engine: engine,
-                    localPromptMode: localPromptMode
-                ),
-                customAIInstructions: CustomAIInstructionFormatter.normalized(customAIInstructions),
-                isFirstTokenGatingEnabled: firstTokenGatingEnabled
-            )
-        }
-        .removeDuplicates()
-        .eraseToAnyPublisher()
+
+        let confidenceSelections = Publishers.CombineLatest(
+            $isFirstTokenConfidenceGatingEnabled,
+            $firstTokenConfidenceThreshold
+        )
+
+        return Publishers.CombineLatest(coreSelections, confidenceSelections)
+            .map { coreTuple, confidenceTuple in
+                let (combinedSettings, localPromptMode, customAIInstructions, firstTokenGatingEnabled) = coreTuple
+                let (globallyEnabled, disabledAppRules, engine, wordCountPreset) = combinedSettings
+                let (confidenceGatingEnabled, confidenceThreshold) = confidenceTuple
+                return SuggestionSettingsSnapshot(
+                    isGloballyEnabled: globallyEnabled,
+                    disabledAppBundleIdentifiers: Set(disabledAppRules.map(\.bundleIdentifier)),
+                    selectedEngine: engine,
+                    selectedWordCountPreset: wordCountPreset,
+                    effectivePromptMode: Self.effectivePromptMode(
+                        engine: engine,
+                        localPromptMode: localPromptMode
+                    ),
+                    customAIInstructions: CustomAIInstructionFormatter.normalized(customAIInstructions),
+                    isFirstTokenGatingEnabled: firstTokenGatingEnabled,
+                    isFirstTokenConfidenceGatingEnabled: confidenceGatingEnabled,
+                    firstTokenConfidenceThreshold: confidenceThreshold
+                )
+            }
+            .removeDuplicates()
+            .eraseToAnyPublisher()
     }
 }
diff --git a/tabby/Services/Runtime/LlamaRuntimeCore.swift b/tabby/Services/Runtime/LlamaRuntimeCore.swift
index ed0107f..8783a1f 100644
--- a/tabby/Services/Runtime/LlamaRuntimeCore.swift
+++ b/tabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -41,6 +41,16 @@ actor LlamaRuntimeCore {
         category: "first-token-gate"
     )
 
+    /// Filterable signal for first-token confidence-based suppression. Distinct category from the
+    /// gate logger because these are *separate signals*: gating masks specific tokens; confidence
+    /// suppression aborts the whole suggestion when the model's distribution at position 0 is too
+    /// flat. A single generation can fire neither, one, or both.
+    ///   log stream --predicate 'subsystem == "app.tabby" AND category == "first-token-confidence"'
+    private static let firstTokenConfidenceLogger = Logger(
+        subsystem: "app.tabby",
+        category: "first-token-confidence"
+    )
+
     private var backendInitialized = false
     private var model: OpaquePointer?
     private var preparedRuntime: PreparedLlamaRuntime?
@@ -162,7 +172,9 @@ actor LlamaRuntimeCore {
         minP: Double,
         repetitionPenalty: Double,
         seed: UInt32? = nil,
-        firstTokenGatingEnabled: Bool = true
+        firstTokenGatingEnabled: Bool = true,
+        firstTokenConfidenceGatingEnabled: Bool = false,
+        firstTokenConfidenceThreshold: Double = 0.0
     ) throws -> String {
         guard let preparedRuntime else {
             throw LlamaRuntimeError.unavailable("The llama model is not loaded.")
@@ -257,6 +269,21 @@ actor LlamaRuntimeCore {
                     logFirstTokenGateFireIfNeeded(context: context, vocab: vocab)
                 }
 
+                // Confidence gating runs *before* sampling so we can abort the whole generation
+                // (and avoid burning a sampled token + decode) when the model's distribution at
+                // position 0 is too flat. The signal is the top-1 probability of the softmax over
+                // the raw logits — not the post-sampler distribution — because temperature/top-p
+                // shape the sampler's output, not the model's actual confidence.
+                if tokenIndex == 0 && firstTokenConfidenceGatingEnabled {
+                    if let suppression = lowConfidenceSuppressionIfNeeded(
+                        context: context,
+                        vocab: vocab,
+                        threshold: firstTokenConfidenceThreshold
+                    ) {
+                        throw suppression
+                    }
+                }
+
                 let nextToken = llama_sampler_sample(activeSampler, context, -1)
                 if nextToken == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, nextToken) {
                     break
@@ -282,6 +309,14 @@ actor LlamaRuntimeCore {
                 try decodeToken(nextToken, position: position, in: context)
                 position += 1
             }
+        } catch let error as LlamaRuntimeError {
+            // Confidence suppression is a clean abort — the prompt KV is still valid and the next
+            // request can reuse it. Reserve the cache reset for genuine generation failures.
+            if case .lowConfidenceSuppression = error {
+                throw error
+            }
+            shouldResetPromptCache = true
+            throw error
         } catch {
             shouldResetPromptCache = true
             throw error
@@ -805,6 +840,73 @@ actor LlamaRuntimeCore {
         )
     }
 
+    /// Returns a `.lowConfidenceSuppression` error when the model's confidence at position 0 is
+    /// below `threshold`, or `nil` when generation should proceed. Confidence is the **top-1
+    /// probability of the softmax over the raw logits** at the last context position — how peaked
+    /// the model's own distribution is, before any sampler-chain transforms.
+    ///
+    /// The *sampled* token's post-transform probability is deliberately not used: temperature,
+    /// top-p, and min-p reshape the distribution, so a 0.9 after top-p can hide a raw top-1 of 0.05.
+    ///
+    /// Cost: two O(nVocab) passes (argmax scan, then the stable exp-sum), run once per generation
+    /// when confidence gating is enabled; a non-positive threshold returns before either pass.
+    private func lowConfidenceSuppressionIfNeeded(
+        context: OpaquePointer,
+        vocab: OpaquePointer,
+        threshold: Double
+    ) -> LlamaRuntimeError? {
+        // Probabilities are never negative and nothing compares below NaN, so a threshold that
+        // is not strictly positive can never suppress: skip both vocabulary scans.
+        guard threshold > 0 else { return nil }
+
+        guard let logits = llama_get_logits_ith(context, -1) else {
+            return nil
+        }
+
+        let nVocab = Int(llama_vocab_n_tokens(vocab))
+        guard nVocab > 0 else { return nil }
+
+        // Pass 1: locate the largest raw logit and the token that owns it.
+        var maxLogit: Float = -.infinity
+        var argmaxTokenID: llama_token = 0
+        for tokenID in 0 ..< nVocab {
+            let value = logits[tokenID]
+            if value > maxLogit {
+                maxLogit = value
+                argmaxTokenID = llama_token(tokenID)
+            }
+        }
+
+        // Pass 2: numerically-stable softmax denominator. Subtracting the max before
+        // exponentiating avoids float overflow on large logits, and makes the argmax probability
+        //   exp(0) / sum(exp(logit_i - max)) = 1 / sum(exp(logit_i - max))
+        var expSum: Double = 0
+        for tokenID in 0 ..< nVocab {
+            expSum += Double(exp(logits[tokenID] - maxLogit))
+        }
+        // A NaN sum (from NaN or +inf logits) fails this comparison, so no decision is made on it.
+        guard expSum > 0 else { return nil }
+
+        let topProbability = Float(1.0 / expSum)
+        guard Double(topProbability) < threshold else {
+            return nil
+        }
+
+        let piece = pieceString(for: argmaxTokenID, vocab: vocab)
+        let escaped = piece
+            .replacingOccurrences(of: "\n", with: "\\n")
+            .replacingOccurrences(of: "\t", with: "\\t")
+        Self.firstTokenConfidenceLogger.debug(
+            "suppressed: top-1 token \(argmaxTokenID, privacy: .public) (\"\(escaped, privacy: .public)\") prob=\(topProbability, privacy: .public) threshold=\(threshold, privacy: .public)"
+        )
+
+        return .lowConfidenceSuppression(
+            probability: topProbability,
+            threshold: threshold,
+            token: piece
+        )
+    }
+
     /// Tokenizes a short string and returns just the first token, without BOS.
     /// Returns nil if tokenization fails or produces no tokens.
     private func tokenizeFirstToken(_ text: String, vocab: OpaquePointer) -> llama_token? {
diff --git a/tabby/Services/Runtime/LlamaRuntimeManager.swift b/tabby/Services/Runtime/LlamaRuntimeManager.swift
index 266dcfa..2394377 100644
--- a/tabby/Services/Runtime/LlamaRuntimeManager.swift
+++ b/tabby/Services/Runtime/LlamaRuntimeManager.swift
@@ -94,7 +94,9 @@ final class LlamaRuntimeManager: ObservableObject {
         minP: Double,
         repetitionPenalty: Double,
         seed: UInt32? = nil,
-        firstTokenGatingEnabled: Bool = true
+        firstTokenGatingEnabled: Bool = true,
+        firstTokenConfidenceGatingEnabled: Bool = false,
+        firstTokenConfidenceThreshold: Double = 0.0
     ) async throws -> String {
         _ = try await preparedRuntime()
 
@@ -109,7 +111,9 @@ final class LlamaRuntimeManager: ObservableObject {
                 minP: minP,
                 repetitionPenalty: repetitionPenalty,
                 seed: seed,
-                firstTokenGatingEnabled: firstTokenGatingEnabled
+                firstTokenGatingEnabled: firstTokenGatingEnabled,
+                firstTokenConfidenceGatingEnabled: firstTokenConfidenceGatingEnabled,
+                firstTokenConfidenceThreshold: firstTokenConfidenceThreshold
             )
         } catch is CancellationError {
             throw LlamaRuntimeError.cancelled
diff --git a/tabby/Services/Runtime/LlamaSuggestionEngine.swift b/tabby/Services/Runtime/LlamaSuggestionEngine.swift
index 1c71a4e..d07b426 100644
--- a/tabby/Services/Runtime/LlamaSuggestionEngine.swift
+++ b/tabby/Services/Runtime/LlamaSuggestionEngine.swift
@@ -21,18 +21,38 @@ final class LlamaSuggestionEngine {
         do {
             let startTime = Date()
             let cachedPrefixBytes = promptCacheHintTracker.cachedPrefixBytes(for: request)
-            let rawSuggestion = try await runtimeManager.generate(
-                prompt: request.prompt,
-                cachedPrefixBytes: cachedPrefixBytes,
-                maxPredictionTokens: request.maxPredictionTokens,
-                temperature: request.temperature,
-                topK: request.topK,
-                topP: request.topP,
-                minP: request.minP,
-                repetitionPenalty: request.repetitionPenalty,
-                seed: request.randomSeed,
-                firstTokenGatingEnabled: request.isFirstTokenGatingEnabled
-            )
+            let rawSuggestion: String
+            do {
+                rawSuggestion = try await runtimeManager.generate(
+                    prompt: request.prompt,
+                    cachedPrefixBytes: cachedPrefixBytes,
+                    maxPredictionTokens: request.maxPredictionTokens,
+                    temperature: request.temperature,
+                    topK: request.topK,
+                    topP: request.topP,
+                    minP: request.minP,
+                    repetitionPenalty: request.repetitionPenalty,
+                    seed: request.randomSeed,
+                    firstTokenGatingEnabled: request.isFirstTokenGatingEnabled,
+                    firstTokenConfidenceGatingEnabled: request.isFirstTokenConfidenceGatingEnabled,
+                    firstTokenConfidenceThreshold: request.firstTokenConfidenceThreshold
+                )
+            } catch let error as LlamaRuntimeError {
+                // Confidence suppression is a normal "no suggestion" outcome, not a failure.
+                // The runtime's own KV stays valid (it threw before any sampled-token decode), so
+                // we record the request with the prompt-cache hint tracker as if it had succeeded:
+                // the next request can still benefit from prefix reuse against this exact prompt.
+                if case .lowConfidenceSuppression = error {
+                    promptCacheHintTracker.recordSuccessfulRequest(request)
+                    return SuggestionResult(
+                        generation: request.generation,
+                        rawText: "",
+                        text: "",
+                        latency: Date().timeIntervalSince(startTime)
+                    )
+                }
+                throw error
+            }
             try Task.checkCancellation()
 
             promptCacheHintTracker.recordSuccessfulRequest(request)
diff --git a/tabby/Support/SuggestionRequestFactory.swift b/tabby/Support/SuggestionRequestFactory.swift
index 989f55a..791eca4 100644
--- a/tabby/Support/SuggestionRequestFactory.swift
+++ b/tabby/Support/SuggestionRequestFactory.swift
@@ -63,7 +63,9 @@ enum SuggestionRequestFactory {
             maxSuffixCharacters: configuration.maxSuffixCharacters,
             completionLengthInstruction: completionLengthInstruction,
             customAIInstructions: customAIInstructions,
-            isFirstTokenGatingEnabled: settings.isFirstTokenGatingEnabled
+            isFirstTokenGatingEnabled: settings.isFirstTokenGatingEnabled,
+            isFirstTokenConfidenceGatingEnabled: settings.isFirstTokenConfidenceGatingEnabled,
+            firstTokenConfidenceThreshold: settings.firstTokenConfidenceThreshold
         )
 
         return SuggestionRequestBuildResult(
diff --git a/tabby/UI/SettingsView.swift b/tabby/UI/SettingsView.swift
index c6180c9..bbfac18 100644
--- a/tabby/UI/SettingsView.swift
+++ b/tabby/UI/SettingsView.swift
@@ -182,6 +182,37 @@ struct SettingsView: View {
                 )
                     .font(.caption)
                     .foregroundStyle(.secondary)
+
+                Toggle("Suppress Low-Confidence Suggestions", isOn: firstTokenConfidenceGatingBinding)
+
+                if suggestionSettings.isFirstTokenConfidenceGatingEnabled {
+                    // Slider only renders when the gate is on so an inactive control doesn't
+                    // imply behavior. The 0...0.5 range covers the useful tuning band — local
+                    // models rarely peak above ~0.6 even on confident tokens, so going past
+                    // 0.5 would suppress almost everything.
+                    LabeledContent("Confidence Threshold") {
+                        HStack(spacing: 8) {
+                            Slider(
+                                value: firstTokenConfidenceThresholdBinding,
+                                in: 0.0 ... 0.5,
+                                step: 0.01
+                            )
+                            .frame(maxWidth: 220)
+
+                            Text(String(format: "%.2f", suggestionSettings.firstTokenConfidenceThreshold))
+                                .font(.callout.monospacedDigit())
+                                .frame(width: 40, alignment: .trailing)
+                        }
+                    }
+                }
+
+                Text(
+                    "Aborts the suggestion when the model's top choice for the first word "
+                    + "is below the threshold probability. Useful when prompts are ambiguous "
+                    + "and the model's distribution is flat. Open Source engine only."
+                )
+                    .font(.caption)
+                    .foregroundStyle(.secondary)
             } else {
                 Text("Completion Style and custom instructions apply to the Open Source engine.")
                     .font(.caption)
@@ -495,6 +526,24 @@ struct SettingsView: View {
         )
     }
 
+    /// Two-way binding for the low-confidence suppression toggle; writes are routed through
+    /// the settings model's setter, which also persists the new value.
+    private var firstTokenConfidenceGatingBinding: Binding<Bool> {
+        Binding(
+            get: { suggestionSettings.isFirstTokenConfidenceGatingEnabled },
+            set: { suggestionSettings.setFirstTokenConfidenceGatingEnabled($0) }
+        )
+    }
+
+    /// Two-way binding for the confidence-threshold slider; writes are routed through the
+    /// settings model's setter, which clamps the value before storing it.
+    private var firstTokenConfidenceThresholdBinding: Binding<Double> {
+        Binding(
+            get: { suggestionSettings.firstTokenConfidenceThreshold },
+            set: { suggestionSettings.setFirstTokenConfidenceThreshold($0) }
+        )
+    }
+
     private var customAIInstructionsBinding: Binding<String> {
         Binding(
             get: { suggestionSettings.customAIInstructions },
diff --git a/tabbyTests/LlamaPromptRendererTests.swift b/tabbyTests/LlamaPromptRendererTests.swift
index 17f19ff..f40d15b 100644
--- a/tabbyTests/LlamaPromptRendererTests.swift
+++ b/tabbyTests/LlamaPromptRendererTests.swift
@@ -213,7 +213,9 @@ final class LlamaPromptRendererTests: XCTestCase {
             maxSuffixCharacters: 192,
             completionLengthInstruction: "Return only the next few words.",
             customAIInstructions: nil,
-            isFirstTokenGatingEnabled: true
+            isFirstTokenGatingEnabled: true,
+            isFirstTokenConfidenceGatingEnabled: false,
+            firstTokenConfidenceThreshold: 0.0
         )
     }
 }