Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 89 additions & 2 deletions pkg/runtime/fallback.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,79 @@ const (
DefaultFallbackCooldown = 1 * time.Minute
)

// ContextOverflowError wraps an underlying error to indicate that the failure
// was caused by the conversation context exceeding the model's context window.
// This is used to trigger auto-compaction in the runtime loop instead of
// surfacing raw HTTP errors to the user.
type ContextOverflowError struct {
Underlying error
}

func (e *ContextOverflowError) Error() string {
return fmt.Sprintf("context window overflow: %s", e.Underlying.Error())
}

func (e *ContextOverflowError) Unwrap() error {
return e.Underlying
}

// contextOverflowPatterns contains error message substrings that indicate the
// prompt/context exceeds the model's context window. These patterns are checked
// case-insensitively against error messages from various providers.
//
// NOTE(review): some entries ("token limit", "request too large") are broad and
// could in principle match unrelated provider errors — confirm against real
// provider payloads before adding similar catch-all phrases.
var contextOverflowPatterns = []string{
	// Anthropic 400: "prompt is too long: N tokens > M maximum".
	"prompt is too long",
	// OpenAI-style messages and the machine-readable error code variant.
	"maximum context length",
	"context length exceeded",
	"context_length_exceeded",
	// Anthropic 400 emitted when the prompt is so large that max_tokens can't
	// accommodate the thinking budget — treated as a proxy for overflow.
	"max_tokens must be greater than",
	"maximum number of tokens",
	"content length exceeds",
	// HTTP 413-style phrasing.
	"request too large",
	"payload too large",
	"input is too long",
	"exceeds the model's max token",
	"token limit",
	// Provider guidance asking the caller to shrink the prompt.
	"reduce your prompt",
	"reduce the length",
}

// isContextOverflowError reports whether err indicates that the conversation
// context has exceeded the model's context window. It recognises both an
// already wrapped *ContextOverflowError anywhere in the error chain and raw
// provider messages matching contextOverflowPatterns (case-insensitive).
//
// Recognised patterns include:
//   - Anthropic 400 "prompt is too long: N tokens > M maximum"
//   - Anthropic 400 "max_tokens must be greater than thinking.budget_tokens"
//     (emitted when the prompt is so large that max_tokens can't accommodate
//     the thinking budget — a proxy for context overflow)
//   - OpenAI 400 "maximum context length" / "context_length_exceeded"
//   - Anthropic 500 that is actually a context overflow (heuristic: the error
//     message is opaque but the conversation was already near the limit)
//
// Generic 500 errors are intentionally NOT matched; callers that want to treat
// an opaque 500 as overflow must check separately with additional context
// (e.g., session token counts).
func isContextOverflowError(err error) bool {
	if err == nil {
		return false
	}

	// A previously wrapped overflow anywhere in the chain counts.
	var overflow *ContextOverflowError
	if errors.As(err, &overflow) {
		return true
	}

	// Otherwise fall back to substring matching on the lowered message.
	lowered := strings.ToLower(err.Error())
	for _, needle := range contextOverflowPatterns {
		if strings.Contains(lowered, needle) {
			return true
		}
	}

	return false
}

// fallbackCooldownState tracks when we should stick with a fallback model
// instead of retrying the primary after a non-retryable error (e.g., 429).
type fallbackCooldownState struct {
Expand Down Expand Up @@ -144,6 +217,14 @@ func isRetryableModelError(err error) bool {
return false
}

// Context overflow errors are never retryable — the context hasn't changed
// between attempts, so retrying the same oversized payload will always fail.
// This avoids wasting time on 3 attempts + exponential backoff.
if isContextOverflowError(err) {
slog.Debug("Context overflow error, not retryable", "error", err)
return false
}

// First, try to extract HTTP status code from known SDK error types
if statusCode := extractHTTPStatusCode(err); statusCode != 0 {
retryable := isRetryableStatusCode(statusCode)
Expand Down Expand Up @@ -587,9 +668,15 @@ func (r *LocalRuntime) tryModelWithFallback(
}
}

// All models and retries exhausted
// All models and retries exhausted.
// If the last error (or any error in the chain) was a context overflow,
// wrap it in a ContextOverflowError so the caller can auto-compact.
if lastErr != nil {
return streamResult{}, nil, fmt.Errorf("all models failed: %w", lastErr)
wrapped := fmt.Errorf("all models failed: %w", lastErr)
if isContextOverflowError(lastErr) {
return streamResult{}, nil, &ContextOverflowError{Underlying: wrapped}
}
return streamResult{}, nil, wrapped
}
return streamResult{}, nil, errors.New("all models failed with unknown error")
}
122 changes: 122 additions & 0 deletions pkg/runtime/fallback_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,21 @@ func TestIsRetryableModelError(t *testing.T) {
err: errors.New("upstream connect error"),
expected: true,
},
{
name: "context overflow - prompt too long",
err: errors.New("prompt is too long: 226360 tokens > 200000 maximum"),
expected: false, // Context overflow should not be retried
},
{
name: "context overflow - thinking budget",
err: errors.New("max_tokens must be greater than thinking.budget_tokens"),
expected: false, // Context overflow should not be retried
},
{
name: "context overflow - wrapped",
err: &ContextOverflowError{Underlying: errors.New("test")},
expected: false, // Context overflow should not be retried
},
{
name: "unknown error",
err: errors.New("something weird happened"),
Expand Down Expand Up @@ -904,6 +919,113 @@ func TestFallbackModelsClonedWithThinkingEnabled(t *testing.T) {
})
}

// TestIsContextOverflowError exercises the classifier across provider error
// messages, wrapped errors, and non-overflow failures.
func TestIsContextOverflowError(t *testing.T) {
	t.Parallel()

	type testCase struct {
		name string
		err  error
		want bool
	}

	cases := []testCase{
		{name: "nil error", err: nil, want: false},
		{name: "generic error", err: errors.New("something went wrong"), want: false},
		{name: "anthropic prompt too long", err: errors.New(`prompt is too long: 226360 tokens > 200000 maximum`), want: true},
		{name: "openai context length exceeded", err: errors.New(`maximum context length is 128000 tokens`), want: true},
		{name: "context_length_exceeded code", err: errors.New(`error code: context_length_exceeded`), want: true},
		{name: "thinking budget error", err: errors.New(`max_tokens must be greater than thinking.budget_tokens`), want: true},
		{name: "request too large", err: errors.New(`request too large for model`), want: true},
		{name: "input is too long", err: errors.New(`input is too long`), want: true},
		{name: "reduce your prompt", err: errors.New(`please reduce your prompt`), want: true},
		{name: "reduce the length", err: errors.New(`please reduce the length of the messages`), want: true},
		{name: "token limit", err: errors.New(`token limit exceeded`), want: true},
		{name: "wrapped ContextOverflowError", err: &ContextOverflowError{Underlying: errors.New("test")}, want: true},
		{name: "errors.As wrapped", err: fmt.Errorf("all models failed: %w", &ContextOverflowError{Underlying: errors.New("test")}), want: true},
		{name: "500 internal server error (not overflow)", err: errors.New(`500 Internal Server Error`), want: false},
		{name: "429 rate limit (not overflow)", err: errors.New(`429 too many requests`), want: false},
		{name: "network timeout (not overflow)", err: errors.New(`connection timeout`), want: false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			got := isContextOverflowError(tc.err)
			assert.Equal(t, tc.want, got, "isContextOverflowError(%v)", tc.err)
		})
	}
}

// TestContextOverflowError verifies the wrapper's Error() formatting and that
// the standard errors.Is / errors.As helpers traverse it correctly.
func TestContextOverflowError(t *testing.T) {
	t.Parallel()

	t.Run("wraps underlying error", func(t *testing.T) {
		t.Parallel()
		inner := errors.New("prompt is too long: 226360 tokens > 200000 maximum")
		wrapper := &ContextOverflowError{Underlying: inner}

		msg := wrapper.Error()
		assert.Contains(t, msg, "context window overflow")
		assert.Contains(t, msg, "prompt is too long")
		assert.ErrorIs(t, wrapper, inner)
	})

	t.Run("errors.As works", func(t *testing.T) {
		t.Parallel()
		inner := errors.New("test error")
		chain := fmt.Errorf("all models failed: %w", &ContextOverflowError{Underlying: inner})

		var target *ContextOverflowError
		assert.ErrorAs(t, chain, &target)
	})
}

// TestIsRetryableModelError_ContextOverflow pins the non-retryable
// classification of overflow errors.
// Context overflow errors should NOT be retryable — the context hasn't changed,
// so retrying the same oversized payload will always fail.
func TestIsRetryableModelError_ContextOverflow(t *testing.T) {
	t.Parallel()

	cases := []struct {
		name string
		err  error
	}{
		{name: "prompt too long", err: errors.New(`prompt is too long: 226360 tokens > 200000 maximum`)},
		{name: "thinking budget cascade", err: errors.New(`max_tokens must be greater than thinking.budget_tokens`)},
		{name: "context length exceeded", err: errors.New(`maximum context length is 128000 tokens`)},
		{name: "wrapped ContextOverflowError", err: &ContextOverflowError{Underlying: errors.New("test")}},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			retryable := isRetryableModelError(tc.err)
			assert.False(t, retryable,
				"context overflow errors should not be retryable: %v", tc.err)
		})
	}
}

// TestFormatModelError covers the three formatting paths: nil input, the
// dedicated context-overflow message, and pass-through of generic errors.
func TestFormatModelError(t *testing.T) {
	t.Parallel()

	t.Run("nil error", func(t *testing.T) {
		t.Parallel()
		assert.Empty(t, formatModelError(nil))
	})

	t.Run("context overflow shows user-friendly message", func(t *testing.T) {
		t.Parallel()
		overflow := &ContextOverflowError{Underlying: errors.New("prompt is too long")}
		got := formatModelError(overflow)
		assert.Contains(t, got, "context window")
		assert.Contains(t, got, "/compact")
		assert.NotContains(t, got, "prompt is too long")
	})

	t.Run("generic error preserves message", func(t *testing.T) {
		t.Parallel()
		plain := errors.New("authentication failed")
		assert.Equal(t, "authentication failed", formatModelError(plain))
	})
}

// Verify interface compliance
var (
_ provider.Provider = (*mockProvider)(nil)
Expand Down
121 changes: 120 additions & 1 deletion pkg/runtime/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -1169,12 +1169,38 @@ func (r *LocalRuntime) RunStream(ctx context.Context, sess *session.Session) <-c
streamSpan.End()
return
}

// Auto-recovery: if the error is a context overflow and
// session compaction is enabled, compact the conversation
// and retry the request instead of surfacing raw errors.
var ctxOverflow *ContextOverflowError
if errors.As(err, &ctxOverflow) && r.sessionCompaction {
slog.Warn("Context window overflow detected, attempting auto-compaction",
"agent", a.Name(),
"session_id", sess.ID,
"input_tokens", sess.InputTokens,
"output_tokens", sess.OutputTokens,
"context_limit", contextLimit,
)
events <- Warning(
"The conversation has exceeded the model's context window. Automatically compacting the conversation history...",
r.CurrentAgentName(),
)
r.Summarize(ctx, sess, "", events)

// After compaction, loop back to retry with the
// compacted context. The next iteration will re-fetch
// messages from the (now compacted) session.
streamSpan.End()
continue
}

streamSpan.RecordError(err)
streamSpan.SetStatus(codes.Error, "error handling stream")
slog.Error("All models failed", "agent", a.Name(), "error", err)
// Track error in telemetry
telemetry.RecordError(ctx, err.Error())
events <- Error(err.Error())
events <- Error(formatModelError(err))
streamSpan.End()
return
}
Expand Down Expand Up @@ -1258,12 +1284,43 @@ func (r *LocalRuntime) RunStream(ctx context.Context, sess *session.Session) <-c
usage.LastMessage = msgUsage
events <- NewTokenUsageEvent(sess.ID, r.CurrentAgentName(), usage)

// Record the message count before tool calls so we can
// measure how much content was added by tool results.
messageCountBeforeTools := len(sess.GetAllMessages())

r.processToolCalls(ctx, sess, res.Calls, agentTools, events)

if res.Stopped {
slog.Debug("Conversation stopped", "agent", a.Name())
break
}

// Root-cause fix for stale token counts (issue #1750):
// After tool calls, sess.InputTokens still reflects the
// *previous* API response and doesn't account for the
// (potentially large) tool results just added. Estimate
// the additional tokens and compact proactively to prevent
// the oversized request from ever being sent.
if m != nil && r.sessionCompaction && contextLimit > 0 {
newMessages := sess.GetAllMessages()[messageCountBeforeTools:]
var addedTokens int64
for _, msg := range newMessages {
addedTokens += estimateMessageTokens(&msg.Message)
}

estimatedTotal := sess.InputTokens + sess.OutputTokens + addedTokens
if estimatedTotal > int64(float64(contextLimit)*0.9) {
slog.Info("Proactive compaction: tool results pushed estimated context past 90%% threshold",
"agent", a.Name(),
"input_tokens", sess.InputTokens,
"output_tokens", sess.OutputTokens,
"added_estimated_tokens", addedTokens,
"estimated_total", estimatedTotal,
"context_limit", contextLimit,
)
r.Summarize(ctx, sess, "", events)
}
}
}
}()

Expand Down Expand Up @@ -2389,3 +2446,65 @@ func stripImageContent(messages []chat.Message) []chat.Message {
}
return result
}

// charsPerToken is the average number of characters per token used for
// estimation. A value of 4 is a widely-used heuristic for English text;
// it slightly overestimates token counts for code/JSON (which is ~3.5),
// making compaction trigger earlier — the safe direction.
const charsPerToken = 4

// estimateMessageTokens returns a rough token-count estimate for a single
// chat message based on its text length. This is intentionally conservative
// (overestimates) so that proactive compaction fires before we hit the limit.
// The estimate includes the message content, multi-content text parts,
// reasoning content, tool-call names/arguments, and a small overhead per
// message for role/metadata tokens.
func estimateMessageTokens(msg *chat.Message) int64 {
	var chars int

	// Primary text content.
	chars += len(msg.Content)

	// Multi-content parts (e.g., tool results with image descriptions).
	for _, part := range msg.MultiContent {
		chars += len(part.Text)
	}

	// Reasoning / thinking content.
	chars += len(msg.ReasoningContent)

	// Tool call arguments (they count toward input tokens on the next turn).
	for _, tc := range msg.ToolCalls {
		chars += len(tc.Function.Arguments)
		chars += len(tc.Function.Name)
	}

	// Per-message overhead: role, ToolCallID, delimiters, etc.
	// Models typically use 3-7 tokens for message framing.
	const perMessageOverhead = 5

	// Ceiling division: the contract above promises an overestimate, but plain
	// chars/charsPerToken truncates and would under-count by up to one token
	// per message. Rounding up also subsumes the empty-message case (chars==0
	// still yields exactly perMessageOverhead).
	return int64((chars+charsPerToken-1)/charsPerToken) + perMessageOverhead
}

// formatModelError produces a user-friendly error message from a model error.
// Context overflow errors receive a dedicated, actionable message (pointing
// the user at /compact) instead of the raw HTTP/provider payload; any other
// error passes through unchanged. A nil error yields the empty string.
func formatModelError(err error) string {
	if err == nil {
		return ""
	}

	// Anything that is not a context overflow is surfaced as-is.
	var overflow *ContextOverflowError
	if !errors.As(err, &overflow) {
		return err.Error()
	}

	return "The conversation has exceeded the model's context window and automatic compaction is not enabled. " +
		"Try running /compact to reduce the conversation size, or start a new session."
}
Loading