continuedev · MukundaKatta · May 28, 2026
@@ -15,7 +15,7 @@ import { DevDataSqliteDb } from "./data/devdataSqlite";
 import { DataLogger } from "./data/log";
 import { CodebaseIndexer } from "./indexing/CodebaseIndexer";
 import DocsService from "./indexing/docs/DocsService";
-import { countTokens } from "./llm/countTokens";
+import { countTokens, getAvailableInputTokens } from "./llm/countTokens";
 import Lemonade from "./llm/llms/Lemonade";
 import { fetchModels } from "./llm/fetchModels";
 import Ollama from "./llm/llms/Ollama";
@@ -1296,12 +1296,12 @@ export class Core {
     }
 
     const tokens = countTokens(item.content, llm.model);
+    const availableTokens = getAvailableInputTokens(
+      llm.contextLength,
+      llm.completionOptions!.maxTokens!,
+    );
 
-    if (tokens > llm.contextLength - llm.completionOptions!.maxTokens!) {
-      return true;
-    }
-
-    return false;
+    return tokens > availableTokens;
   }
 
   private handleAddAutocompleteModel(

@@ -7,6 +7,7 @@ import {
   countTokens,
   countTokensAsync,
   extractToolSequence,
+  getAvailableInputTokens,
   pruneLinesFromBottom,
   pruneLinesFromTop,
   pruneRawPromptFromTop,
@@ -28,6 +29,34 @@ describe.skip("countTokens", () => {
   });
 });
 
+describe("getAvailableInputTokens", () => {
+  it("reserves only a minimum response allowance, not the full maxTokens", () => {
+    // 8k window with the default 4096 maxTokens (e.g. a local model with no
+    // known completion limit). The legacy guard held back all 4096; the helper
+    // holds back only the safety buffer plus MIN_RESPONSE_TOKENS (1000).
+    const contextLength = 8192;
+    const maxTokens = 4096;
+    const legacyBudget = contextLength - maxTokens;
+    const safetyBuffer = 163.84; // min(1000, 8192 * 0.02)
+
+    const available = getAvailableInputTokens(contextLength, maxTokens);
+
+    // Strictly roomier than the legacy full-maxTokens reservation.
+    expect(available).toBeGreaterThan(legacyBudget);
+    expect(available).toBeCloseTo(contextLength - safetyBuffer - 1000, 2);
+  });
+
+  it("reserves maxTokens when it is below the minimum response allowance", () => {
+    const contextLength = 100_000;
+    const maxTokens = 256;
+    const safetyBuffer = 1000; // 2% of the window is 2000, capped at 1000
+
+    const available = getAvailableInputTokens(contextLength, maxTokens);
+
+    expect(available).toBe(contextLength - safetyBuffer - maxTokens);
+  });
+});
+
 describe("countTokensAsync", () => {
   afterAll(async () => {
     // Clean up the global async encoders to prevent Jest from hanging

@@ -376,6 +376,26 @@ export function getTokenCountingBufferSafety(contextLength: number) {
 
 const MIN_RESPONSE_TOKENS = 1000;
 
+/**
+ * Computes the input-token budget for one request: the context window minus
+ * the token-counting safety buffer and a response reservation of
+ * `min(MIN_RESPONSE_TOKENS, maxTokens)`. The full `maxTokens` is deliberately
+ * not reserved, since that would flag a single context item as too large on
+ * small windows or with the 4096 default, even though the message compiler
+ * (compileChatMessages) would still include it.
+ * @param contextLength - Context window size, in tokens.
+ * @param maxTokens - Configured completion budget, in tokens.
+ * @returns Tokens left for input content; the result is not clamped.
+ */
+export function getAvailableInputTokens(
+  contextLength: number,
+  maxTokens: number,
+): number {
+  const safetyBuffer = getTokenCountingBufferSafety(contextLength);
+  const responseReserve = Math.min(MIN_RESPONSE_TOKENS, maxTokens);
+  return contextLength - safetyBuffer - responseReserve;
+}
+
 function pruneRawPromptFromTop(
   modelName: string,
   contextLength: number,