Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 89 additions & 2 deletions pkg/runtime/fallback.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,79 @@ const (
DefaultFallbackCooldown = 1 * time.Minute
)

// ContextOverflowError wraps an underlying error to indicate that the failure
// was caused by the conversation context exceeding the model's context window.
// This is used to trigger auto-compaction in the runtime loop instead of
// surfacing raw HTTP errors to the user.
type ContextOverflowError struct {
Underlying error
}

func (e *ContextOverflowError) Error() string {
return fmt.Sprintf("context window overflow: %s", e.Underlying.Error())
}

func (e *ContextOverflowError) Unwrap() error {
return e.Underlying
}

// contextOverflowPatterns contains error message substrings that indicate the
// prompt/context exceeds the model's context window. These patterns are checked
// case-insensitively against error messages from various providers.
//
// NOTE(review): some entries ("token limit", "request too large") are broad and
// could in principle match unrelated provider errors — confirm against real
// provider payloads before adding similar catch-all phrases.
var contextOverflowPatterns = []string{
	// Anthropic 400: "prompt is too long: N tokens > M maximum".
	"prompt is too long",
	// OpenAI-style messages and the machine-readable error code variant.
	"maximum context length",
	"context length exceeded",
	"context_length_exceeded",
	// Anthropic 400 emitted when the prompt is so large that max_tokens can't
	// accommodate the thinking budget — treated as a proxy for overflow.
	"max_tokens must be greater than",
	"maximum number of tokens",
	"content length exceeds",
	// HTTP 413-style phrasing.
	"request too large",
	"payload too large",
	"input is too long",
	"exceeds the model's max token",
	"token limit",
	// Provider guidance asking the caller to shrink the prompt.
	"reduce your prompt",
	"reduce the length",
}

// isContextOverflowError reports whether err indicates that the conversation
// context has exceeded the model's context window. It recognises both an
// already wrapped *ContextOverflowError anywhere in the error chain and raw
// provider messages matching contextOverflowPatterns (case-insensitive).
//
// Recognised patterns include:
//   - Anthropic 400 "prompt is too long: N tokens > M maximum"
//   - Anthropic 400 "max_tokens must be greater than thinking.budget_tokens"
//     (emitted when the prompt is so large that max_tokens can't accommodate
//     the thinking budget — a proxy for context overflow)
//   - OpenAI 400 "maximum context length" / "context_length_exceeded"
//   - Anthropic 500 that is actually a context overflow (heuristic: the error
//     message is opaque but the conversation was already near the limit)
//
// Generic 500 errors are intentionally NOT matched; callers that want to treat
// an opaque 500 as overflow must check separately with additional context
// (e.g., session token counts).
func isContextOverflowError(err error) bool {
	if err == nil {
		return false
	}

	// A previously wrapped overflow anywhere in the chain counts.
	var overflow *ContextOverflowError
	if errors.As(err, &overflow) {
		return true
	}

	// Otherwise fall back to substring matching on the lowered message.
	lowered := strings.ToLower(err.Error())
	for _, needle := range contextOverflowPatterns {
		if strings.Contains(lowered, needle) {
			return true
		}
	}

	return false
}

// fallbackCooldownState tracks when we should stick with a fallback model
// instead of retrying the primary after a non-retryable error (e.g., 429).
type fallbackCooldownState struct {
Expand Down Expand Up @@ -144,6 +217,14 @@ func isRetryableModelError(err error) bool {
return false
}

// Context overflow errors are never retryable — the context hasn't changed
// between attempts, so retrying the same oversized payload will always fail.
// This avoids wasting time on 3 attempts + exponential backoff.
if isContextOverflowError(err) {
slog.Debug("Context overflow error, not retryable", "error", err)
return false
}

// First, try to extract HTTP status code from known SDK error types
if statusCode := extractHTTPStatusCode(err); statusCode != 0 {
retryable := isRetryableStatusCode(statusCode)
Expand Down Expand Up @@ -587,9 +668,15 @@ func (r *LocalRuntime) tryModelWithFallback(
}
}

// All models and retries exhausted
// All models and retries exhausted.
// If the last error (or any error in the chain) was a context overflow,
// wrap it in a ContextOverflowError so the caller can auto-compact.
if lastErr != nil {
return streamResult{}, nil, fmt.Errorf("all models failed: %w", lastErr)
wrapped := fmt.Errorf("all models failed: %w", lastErr)
if isContextOverflowError(lastErr) {
return streamResult{}, nil, &ContextOverflowError{Underlying: wrapped}
}
return streamResult{}, nil, wrapped
}
return streamResult{}, nil, errors.New("all models failed with unknown error")
}
122 changes: 122 additions & 0 deletions pkg/runtime/fallback_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,21 @@ func TestIsRetryableModelError(t *testing.T) {
err: errors.New("upstream connect error"),
expected: true,
},
{
name: "context overflow - prompt too long",
err: errors.New("prompt is too long: 226360 tokens > 200000 maximum"),
expected: false, // Context overflow should not be retried
},
{
name: "context overflow - thinking budget",
err: errors.New("max_tokens must be greater than thinking.budget_tokens"),
expected: false, // Context overflow should not be retried
},
{
name: "context overflow - wrapped",
err: &ContextOverflowError{Underlying: errors.New("test")},
expected: false, // Context overflow should not be retried
},
{
name: "unknown error",
err: errors.New("something weird happened"),
Expand Down Expand Up @@ -904,6 +919,113 @@ func TestFallbackModelsClonedWithThinkingEnabled(t *testing.T) {
})
}

// TestIsContextOverflowError exercises the classifier across provider error
// messages, wrapped errors, and non-overflow failures.
func TestIsContextOverflowError(t *testing.T) {
	t.Parallel()

	type testCase struct {
		name string
		err  error
		want bool
	}

	cases := []testCase{
		{name: "nil error", err: nil, want: false},
		{name: "generic error", err: errors.New("something went wrong"), want: false},
		{name: "anthropic prompt too long", err: errors.New(`prompt is too long: 226360 tokens > 200000 maximum`), want: true},
		{name: "openai context length exceeded", err: errors.New(`maximum context length is 128000 tokens`), want: true},
		{name: "context_length_exceeded code", err: errors.New(`error code: context_length_exceeded`), want: true},
		{name: "thinking budget error", err: errors.New(`max_tokens must be greater than thinking.budget_tokens`), want: true},
		{name: "request too large", err: errors.New(`request too large for model`), want: true},
		{name: "input is too long", err: errors.New(`input is too long`), want: true},
		{name: "reduce your prompt", err: errors.New(`please reduce your prompt`), want: true},
		{name: "reduce the length", err: errors.New(`please reduce the length of the messages`), want: true},
		{name: "token limit", err: errors.New(`token limit exceeded`), want: true},
		{name: "wrapped ContextOverflowError", err: &ContextOverflowError{Underlying: errors.New("test")}, want: true},
		{name: "errors.As wrapped", err: fmt.Errorf("all models failed: %w", &ContextOverflowError{Underlying: errors.New("test")}), want: true},
		{name: "500 internal server error (not overflow)", err: errors.New(`500 Internal Server Error`), want: false},
		{name: "429 rate limit (not overflow)", err: errors.New(`429 too many requests`), want: false},
		{name: "network timeout (not overflow)", err: errors.New(`connection timeout`), want: false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			got := isContextOverflowError(tc.err)
			assert.Equal(t, tc.want, got, "isContextOverflowError(%v)", tc.err)
		})
	}
}

// TestContextOverflowError verifies the wrapper's Error() formatting and that
// the standard errors.Is / errors.As helpers traverse it correctly.
func TestContextOverflowError(t *testing.T) {
	t.Parallel()

	t.Run("wraps underlying error", func(t *testing.T) {
		t.Parallel()
		inner := errors.New("prompt is too long: 226360 tokens > 200000 maximum")
		wrapper := &ContextOverflowError{Underlying: inner}

		msg := wrapper.Error()
		assert.Contains(t, msg, "context window overflow")
		assert.Contains(t, msg, "prompt is too long")
		assert.ErrorIs(t, wrapper, inner)
	})

	t.Run("errors.As works", func(t *testing.T) {
		t.Parallel()
		inner := errors.New("test error")
		chain := fmt.Errorf("all models failed: %w", &ContextOverflowError{Underlying: inner})

		var target *ContextOverflowError
		assert.ErrorAs(t, chain, &target)
	})
}

// TestIsRetryableModelError_ContextOverflow pins the non-retryable
// classification of overflow errors.
// Context overflow errors should NOT be retryable — the context hasn't changed,
// so retrying the same oversized payload will always fail.
func TestIsRetryableModelError_ContextOverflow(t *testing.T) {
	t.Parallel()

	cases := []struct {
		name string
		err  error
	}{
		{name: "prompt too long", err: errors.New(`prompt is too long: 226360 tokens > 200000 maximum`)},
		{name: "thinking budget cascade", err: errors.New(`max_tokens must be greater than thinking.budget_tokens`)},
		{name: "context length exceeded", err: errors.New(`maximum context length is 128000 tokens`)},
		{name: "wrapped ContextOverflowError", err: &ContextOverflowError{Underlying: errors.New("test")}},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()
			retryable := isRetryableModelError(tc.err)
			assert.False(t, retryable,
				"context overflow errors should not be retryable: %v", tc.err)
		})
	}
}

// TestFormatModelError covers the three formatting paths: nil input, the
// dedicated context-overflow message, and pass-through of generic errors.
func TestFormatModelError(t *testing.T) {
	t.Parallel()

	t.Run("nil error", func(t *testing.T) {
		t.Parallel()
		assert.Empty(t, formatModelError(nil))
	})

	t.Run("context overflow shows user-friendly message", func(t *testing.T) {
		t.Parallel()
		overflow := &ContextOverflowError{Underlying: errors.New("prompt is too long")}
		got := formatModelError(overflow)
		assert.Contains(t, got, "context window")
		assert.Contains(t, got, "/compact")
		assert.NotContains(t, got, "prompt is too long")
	})

	t.Run("generic error preserves message", func(t *testing.T) {
		t.Parallel()
		plain := errors.New("authentication failed")
		assert.Equal(t, "authentication failed", formatModelError(plain))
	})
}

// Verify interface compliance
var (
_ provider.Provider = (*mockProvider)(nil)
Expand Down
121 changes: 120 additions & 1 deletion pkg/runtime/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -1169,12 +1169,38 @@ func (r *LocalRuntime) RunStream(ctx context.Context, sess *session.Session) <-c
streamSpan.End()
return
}

// Auto-recovery: if the error is a context overflow and
// session compaction is enabled, compact the conversation
// and retry the request instead of surfacing raw errors.
var ctxOverflow *ContextOverflowError
if errors.As(err, &ctxOverflow) && r.sessionCompaction {
slog.Warn("Context window overflow detected, attempting auto-compaction",
"agent", a.Name(),
"session_id", sess.ID,
"input_tokens", sess.InputTokens,
"output_tokens", sess.OutputTokens,
"context_limit", contextLimit,
)
events <- Warning(
"The conversation has exceeded the model's context window. Automatically compacting the conversation history...",
r.CurrentAgentName(),
)
r.Summarize(ctx, sess, "", events)

// After compaction, loop back to retry with the
// compacted context. The next iteration will re-fetch
// messages from the (now compacted) session.
streamSpan.End()
continue
}

streamSpan.RecordError(err)
streamSpan.SetStatus(codes.Error, "error handling stream")
slog.Error("All models failed", "agent", a.Name(), "error", err)
// Track error in telemetry
telemetry.RecordError(ctx, err.Error())
events <- Error(err.Error())
events <- Error(formatModelError(err))
streamSpan.End()
return
}
Expand Down Expand Up @@ -1258,12 +1284,43 @@ func (r *LocalRuntime) RunStream(ctx context.Context, sess *session.Session) <-c
usage.LastMessage = msgUsage
events <- NewTokenUsageEvent(sess.ID, r.CurrentAgentName(), usage)

// Record the message count before tool calls so we can
// measure how much content was added by tool results.
messageCountBeforeTools := len(sess.GetAllMessages())

r.processToolCalls(ctx, sess, res.Calls, agentTools, events)

if res.Stopped {
slog.Debug("Conversation stopped", "agent", a.Name())
break
}

// Root-cause fix for stale token counts (issue #1750):
// After tool calls, sess.InputTokens still reflects the
// *previous* API response and doesn't account for the
// (potentially large) tool results just added. Estimate
// the additional tokens and compact proactively to prevent
// the oversized request from ever being sent.
if m != nil && r.sessionCompaction && contextLimit > 0 {
newMessages := sess.GetAllMessages()[messageCountBeforeTools:]
var addedTokens int64
for _, msg := range newMessages {
addedTokens += estimateMessageTokens(&msg.Message)
}

estimatedTotal := sess.InputTokens + sess.OutputTokens + addedTokens
if estimatedTotal > int64(float64(contextLimit)*0.9) {
slog.Info("Proactive compaction: tool results pushed estimated context past 90%% threshold",
"agent", a.Name(),
"input_tokens", sess.InputTokens,
"output_tokens", sess.OutputTokens,
"added_estimated_tokens", addedTokens,
"estimated_total", estimatedTotal,
"context_limit", contextLimit,
)
r.Summarize(ctx, sess, "", events)
}
}
}
}()

Expand Down Expand Up @@ -2389,3 +2446,65 @@ func stripImageContent(messages []chat.Message) []chat.Message {
}
return result
}

// charsPerToken is the average number of characters per token used for
// estimation. A value of 4 is a widely-used heuristic for English text;
// it slightly overestimates token counts for code/JSON (which is ~3.5),
// making compaction trigger earlier — the safe direction.
const charsPerToken = 4

// estimateMessageTokens returns a rough token-count estimate for a single
// chat message based on its text length. This is intentionally conservative
// (overestimates) so that proactive compaction fires before we hit the limit.
// The estimate includes the message content, multi-content text parts,
// reasoning content, tool-call names/arguments, and a small overhead per
// message for role/metadata tokens.
func estimateMessageTokens(msg *chat.Message) int64 {
	var chars int

	// Primary text content.
	chars += len(msg.Content)

	// Multi-content parts (e.g., tool results with image descriptions).
	for _, part := range msg.MultiContent {
		chars += len(part.Text)
	}

	// Reasoning / thinking content.
	chars += len(msg.ReasoningContent)

	// Tool call arguments (they count toward input tokens on the next turn).
	for _, tc := range msg.ToolCalls {
		chars += len(tc.Function.Arguments)
		chars += len(tc.Function.Name)
	}

	// Per-message overhead: role, ToolCallID, delimiters, etc.
	// Models typically use 3-7 tokens for message framing.
	const perMessageOverhead = 5

	// Ceiling division: the contract above promises an overestimate, but plain
	// chars/charsPerToken truncates and would under-count by up to one token
	// per message. Rounding up also subsumes the empty-message case (chars==0
	// still yields exactly perMessageOverhead).
	return int64((chars+charsPerToken-1)/charsPerToken) + perMessageOverhead
}

// formatModelError produces a user-friendly error message from a model error.
// Context overflow errors receive a dedicated, actionable message (pointing
// the user at /compact) instead of the raw HTTP/provider payload; any other
// error passes through unchanged. A nil error yields the empty string.
func formatModelError(err error) string {
	if err == nil {
		return ""
	}

	// Anything that is not a context overflow is surfaced as-is.
	var overflow *ContextOverflowError
	if !errors.As(err, &overflow) {
		return err.Error()
	}

	return "The conversation has exceeded the model's context window and automatic compaction is not enabled. " +
		"Try running /compact to reduce the conversation size, or start a new session."
}
Loading