56 changes: 4 additions & 52 deletions src/api/providers/anthropic-vertex.ts
@@ -26,6 +26,7 @@ import {
handleAiSdkError,
} from "../transform/ai-sdk"
import { calculateApiCostAnthropic } from "../../shared/cost"
import { applyCacheBreakpoints } from "../transform/cache-breakpoints"

import { DEFAULT_HEADERS } from "./constants"
import { BaseProvider } from "./base-provider"
@@ -124,36 +125,10 @@ export class AnthropicVertexHandler extends BaseProvider implements SingleComple
anthropicProviderOptions.disableParallelToolUse = true
}

/**
* Vertex API has specific limitations for prompt caching:
* 1. Maximum of 4 blocks can have cache_control
* 2. Only text blocks can be cached (images and other content types cannot)
* 3. Cache control can only be applied to user messages, not assistant messages
*
* Our caching strategy:
* - Cache the system prompt (1 block)
* - Cache the last text block of the second-to-last user message (1 block)
* - Cache the last text block of the last user message (1 block)
* This ensures we stay under the 4-block limit while maintaining effective caching
* for the most relevant context.
*/
// Apply cache control to user messages (Vertex allows up to 4 cache_control blocks;
// 1 for system prompt + 2 for the last 2 user message batches).
const cacheProviderOption = { anthropic: { cacheControl: { type: "ephemeral" as const } } }

const userMsgIndices = messages.reduce(
(acc, msg, index) => ("role" in msg && msg.role === "user" ? [...acc, index] : acc),
[] as number[],
)

const targetIndices = new Set<number>()
const lastUserMsgIndex = userMsgIndices[userMsgIndices.length - 1] ?? -1
const secondLastUserMsgIndex = userMsgIndices[userMsgIndices.length - 2] ?? -1

if (lastUserMsgIndex >= 0) targetIndices.add(lastUserMsgIndex)
if (secondLastUserMsgIndex >= 0) targetIndices.add(secondLastUserMsgIndex)

if (targetIndices.size > 0) {
this.applyCacheControlToAiSdkMessages(messages as ModelMessage[], targetIndices, cacheProviderOption)
}
applyCacheBreakpoints(messages, { cacheProviderOption })

// Build streamText request
// Cast providerOptions to any to bypass strict JSONObject typing — the AI SDK accepts the correct runtime values
@@ -260,29 +235,6 @@ export class AnthropicVertexHandler extends BaseProvider implements SingleComple
}
}

/**
* Apply cacheControl providerOptions to the correct AI SDK messages by walking
* the original Anthropic messages and converted AI SDK messages in parallel.
*
* convertToAiSdkMessages() can split a single Anthropic user message (containing
* tool_results + text) into 2 AI SDK messages (tool role + user role). This method
* accounts for that split so cache control lands on the right message.
*/
private applyCacheControlToAiSdkMessages(
aiSdkMessages: { role: string; providerOptions?: Record<string, Record<string, unknown>> }[],
targetIndices: Set<number>,
cacheProviderOption: Record<string, Record<string, unknown>>,
): void {
for (const idx of targetIndices) {
if (idx >= 0 && idx < aiSdkMessages.length) {
aiSdkMessages[idx].providerOptions = {
...aiSdkMessages[idx].providerOptions,
...cacheProviderOption,
}
}
}
}

getModel() {
const modelId = this.options.apiModelId
let id = modelId && modelId in vertexModels ? (modelId as VertexModelId) : vertexDefaultModelId
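The shared helper referenced above lives in src/api/transform/cache-breakpoints.ts, which is not part of this diff. The following is a minimal sketch of its default behavior, reconstructed from the inline logic it replaces in anthropic-vertex.ts and anthropic.ts; the RooMessage shape and the options interface are assumptions inferred from the call sites, not the actual implementation.

// Sketch only: the real helper (src/api/transform/cache-breakpoints.ts) is not shown in this diff.
// Default behavior inferred from the removed inline logic: apply the provider-specific cache
// option to the last two user messages (write-to-cache + read-from-cache).
type RooMessage = { role: string; providerOptions?: Record<string, Record<string, unknown>> }

interface CacheBreakpointOptions {
	cacheProviderOption: Record<string, Record<string, unknown>>
	maxMessageBreakpoints?: number // assumed to default to 2
	useAnchor?: boolean
	anchorThreshold?: number
}

export function applyCacheBreakpoints(messages: RooMessage[], options: CacheBreakpointOptions): void {
	const { cacheProviderOption } = options

	// Indices of user messages, in conversation order.
	const userIndices = messages.reduce<number[]>(
		(acc, msg, idx) => (msg.role === "user" ? [...acc, idx] : acc),
		[],
	)

	// Mark the last two user messages, mirroring the per-provider logic this PR removes.
	const targets = new Set(userIndices.slice(-2))

	for (const idx of targets) {
		messages[idx].providerOptions = { ...messages[idx].providerOptions, ...cacheProviderOption }
	}
}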
41 changes: 2 additions & 39 deletions src/api/providers/anthropic.ts
@@ -25,6 +25,7 @@ import {
handleAiSdkError,
} from "../transform/ai-sdk"
import { calculateApiCostAnthropic } from "../../shared/cost"
import { applyCacheBreakpoints } from "../transform/cache-breakpoints"

import { DEFAULT_HEADERS } from "./constants"
import { BaseProvider } from "./base-provider"
@@ -114,22 +115,7 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHa
// Apply cache control to user messages
// Strategy: cache the last 2 user messages (write-to-cache + read-from-cache)
const cacheProviderOption = { anthropic: { cacheControl: { type: "ephemeral" as const } } }

const userMsgIndices = messages.reduce(
(acc, msg, index) => ("role" in msg && msg.role === "user" ? [...acc, index] : acc),
[] as number[],
)

const targetIndices = new Set<number>()
const lastUserMsgIndex = userMsgIndices[userMsgIndices.length - 1] ?? -1
const secondLastUserMsgIndex = userMsgIndices[userMsgIndices.length - 2] ?? -1

if (lastUserMsgIndex >= 0) targetIndices.add(lastUserMsgIndex)
if (secondLastUserMsgIndex >= 0) targetIndices.add(secondLastUserMsgIndex)

if (targetIndices.size > 0) {
this.applyCacheControlToAiSdkMessages(messages as ModelMessage[], targetIndices, cacheProviderOption)
}
applyCacheBreakpoints(messages, { cacheProviderOption })

// Build streamText request
// Cast providerOptions to any to bypass strict JSONObject typing — the AI SDK accepts the correct runtime values
@@ -236,29 +222,6 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHa
}
}

/**
* Apply cacheControl providerOptions to the correct AI SDK messages by walking
* the original Anthropic messages and converted AI SDK messages in parallel.
*
* convertToAiSdkMessages() can split a single Anthropic user message (containing
* tool_results + text) into 2 AI SDK messages (tool role + user role). This method
* accounts for that split so cache control lands on the right message.
*/
private applyCacheControlToAiSdkMessages(
aiSdkMessages: { role: string; providerOptions?: Record<string, Record<string, unknown>> }[],
targetIndices: Set<number>,
cacheProviderOption: Record<string, Record<string, unknown>>,
): void {
for (const idx of targetIndices) {
if (idx >= 0 && idx < aiSdkMessages.length) {
aiSdkMessages[idx].providerOptions = {
...aiSdkMessages[idx].providerOptions,
...cacheProviderOption,
}
}
}
}

getModel() {
const modelId = this.options.apiModelId
let id = modelId && modelId in anthropicModels ? (modelId as AnthropicModelId) : anthropicDefaultModelId
70 changes: 7 additions & 63 deletions src/api/providers/bedrock.ts
@@ -32,6 +32,7 @@ import {
handleAiSdkError,
} from "../transform/ai-sdk"
import { getModelParams } from "../transform/model-params"
import { applyCacheBreakpoints } from "../transform/cache-breakpoints"
import { shouldUseReasoningBudget } from "../../shared/api"
import { BaseProvider } from "./base-provider"
import { DEFAULT_HEADERS } from "./constants"
@@ -276,46 +277,12 @@ export class AwsBedrockHandler extends BaseProvider implements SingleCompletionH

if (usePromptCache) {
const cachePointOption = { bedrock: { cachePoint: { type: "default" as const } } }

// Find all user message indices in the original (pre-conversion) message array.
const originalUserIndices = filteredMessages.reduce<number[]>(
(acc, msg, idx) => ("role" in msg && msg.role === "user" ? [...acc, idx] : acc),
[],
)

// Select up to 3 user messages for cache points (system prompt uses the 4th):
// - Last user message: write to cache for next request
// - Second-to-last user message: read from cache for current request
// - An "anchor" message earlier in the conversation for 20-block window coverage
const targetOriginalIndices = new Set<number>()
const numUserMsgs = originalUserIndices.length

if (numUserMsgs >= 1) {
// Always cache the last user message
targetOriginalIndices.add(originalUserIndices[numUserMsgs - 1])
}
if (numUserMsgs >= 2) {
// Cache the second-to-last user message
targetOriginalIndices.add(originalUserIndices[numUserMsgs - 2])
}
if (numUserMsgs >= 5) {
// Add an anchor cache point roughly in the first third of user messages.
// This ensures that the 20-block lookback from the second-to-last breakpoint
// can find a stable cache entry, covering all the assistant and tool messages
// in the middle of the conversation. We pick the user message at ~1/3 position.
const anchorIdx = Math.floor(numUserMsgs / 3)
// Only add if it's not already one of the last-2 targets
if (!targetOriginalIndices.has(originalUserIndices[anchorIdx])) {
targetOriginalIndices.add(originalUserIndices[anchorIdx])
}
}

// Apply cachePoint to the correct AI SDK messages by walking both arrays in parallel.
// A single original user message with tool_results becomes [tool-role msg, user-role msg]
// in the AI SDK array, while a plain user message becomes [user-role msg].
if (targetOriginalIndices.size > 0) {
this.applyCachePointsToAiSdkMessages(aiSdkMessages, targetOriginalIndices, cachePointOption)
}
applyCacheBreakpoints(aiSdkMessages as RooMessage[], {
cacheProviderOption: cachePointOption,
maxMessageBreakpoints: 3,
useAnchor: true,
anchorThreshold: 5,
})
}

// Build streamText request
@@ -734,29 +701,6 @@ export class AwsBedrockHandler extends BaseProvider implements SingleCompletionH
)
}

/**
* Apply cachePoint providerOptions to the correct AI SDK messages by walking
* the original Anthropic messages and converted AI SDK messages in parallel.
*
* convertToAiSdkMessages() can split a single Anthropic user message (containing
* tool_results + text) into 2 AI SDK messages (tool role + user role). This method
* accounts for that split so cache points land on the right message.
*/
private applyCachePointsToAiSdkMessages(
aiSdkMessages: { role: string; providerOptions?: Record<string, Record<string, unknown>> }[],
targetIndices: Set<number>,
cachePointOption: Record<string, Record<string, unknown>>,
): void {
for (const idx of targetIndices) {
if (idx >= 0 && idx < aiSdkMessages.length) {
aiSdkMessages[idx].providerOptions = {
...aiSdkMessages[idx].providerOptions,
...cachePointOption,
}
}
}
}

/************************************************************************************
*
* AMAZON REGIONS
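bedrock.ts calls the same helper with additional options (maxMessageBreakpoints: 3, useAnchor: true, anchorThreshold: 5). Extending the sketch above, the anchor handling reconstructed from the Bedrock-specific code this PR removes might look roughly like the following; the option semantics are assumptions based on the old inline comments, not the actual implementation.

// Sketch only: continues the applyCacheBreakpoints sketch above. When useAnchor is set and the
// conversation has at least anchorThreshold user messages, add one more breakpoint on the user
// message roughly one third into the conversation, so the 20-block cache lookback from the
// second-to-last breakpoint can find a stable cache entry covering the middle of the conversation.
if (options.useAnchor && userIndices.length >= (options.anchorThreshold ?? 5)) {
	const anchorIdx = userIndices[Math.floor(userIndices.length / 3)]
	// Skip if the anchor already coincides with one of the last-two targets, and stay within the
	// breakpoint budget (assumed meaning of maxMessageBreakpoints).
	if (!targets.has(anchorIdx) && targets.size < (options.maxMessageBreakpoints ?? 3)) {
		targets.add(anchorIdx)
	}
}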
32 changes: 2 additions & 30 deletions src/api/providers/minimax.ts
@@ -16,6 +16,7 @@ import {
handleAiSdkError,
} from "../transform/ai-sdk"
import { calculateApiCostAnthropic } from "../../shared/cost"
import { applyCacheBreakpoints } from "../transform/cache-breakpoints"

import { DEFAULT_HEADERS } from "./constants"
import { BaseProvider } from "./base-provider"
@@ -95,21 +96,7 @@ export class MiniMaxHandler extends BaseProvider implements SingleCompletionHand
}

const cacheProviderOption = { anthropic: { cacheControl: { type: "ephemeral" as const } } }
const userMsgIndices = mergedMessages.reduce(
(acc, msg, index) => (msg.role === "user" ? [...acc, index] : acc),
[] as number[],
)

const targetIndices = new Set<number>()
const lastUserMsgIndex = userMsgIndices[userMsgIndices.length - 1] ?? -1
const secondLastUserMsgIndex = userMsgIndices[userMsgIndices.length - 2] ?? -1

if (lastUserMsgIndex >= 0) targetIndices.add(lastUserMsgIndex)
if (secondLastUserMsgIndex >= 0) targetIndices.add(secondLastUserMsgIndex)

if (targetIndices.size > 0) {
this.applyCacheControlToAiSdkMessages(aiSdkMessages, targetIndices, cacheProviderOption)
}
applyCacheBreakpoints(aiSdkMessages as RooMessage[], { cacheProviderOption })

const requestOptions = {
model: this.client(modelConfig.id),
@@ -212,21 +199,6 @@ export class MiniMaxHandler extends BaseProvider implements SingleCompletionHand
}
}

private applyCacheControlToAiSdkMessages(
aiSdkMessages: { role: string; providerOptions?: Record<string, Record<string, unknown>> }[],
targetIndices: Set<number>,
cacheProviderOption: Record<string, Record<string, unknown>>,
): void {
for (const idx of targetIndices) {
if (idx >= 0 && idx < aiSdkMessages.length) {
aiSdkMessages[idx].providerOptions = {
...aiSdkMessages[idx].providerOptions,
...cacheProviderOption,
}
}
}
}

getModel() {
const modelId = this.options.apiModelId
