99 changes: 99 additions & 0 deletions python/samples/concepts/caching/anthropic_prompt_caching.py
@@ -0,0 +1,99 @@
# Copyright (c) Microsoft. All rights reserved.

# This sample demonstrates Anthropic prompt caching with Semantic Kernel.
# Prompt caching lets you mark parts of a request (system message, tool definitions)
# as cacheable so that repeated calls reuse the cached tokens at 0.1x read cost.
#
# Prerequisites:
# - Set ANTHROPIC_API_KEY and ANTHROPIC_CHAT_MODEL_ID in your environment or a .env file.
# - Model must support caching (claude-haiku-4-5, claude-sonnet-4-x, claude-opus-4-x).
# - Minimum tokens to activate cache: 4,096 (Haiku), 1,024 (Sonnet/Opus).
#
# Run:
# uv run python samples/concepts/caching/anthropic_prompt_caching.py

import asyncio

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.anthropic import (
    AnthropicCacheSettings,
    AnthropicChatCompletion,
    AnthropicChatPromptExecutionSettings,
)
from semantic_kernel.contents import ChatHistory

# A long system prompt that exceeds the minimum token threshold for caching.
# In production this would typically be a large instruction set, persona, or
# document that stays the same across many turns.
SYSTEM_PROMPT = (
"""
You are an expert software engineer specializing in Python and distributed systems.
You provide precise, production-quality answers. When writing code you follow these rules:
- Use type hints throughout.
- Prefer composition over inheritance.
- Write small, single-purpose functions.
- Handle errors explicitly; never silence exceptions.
- Include a brief docstring only when the intent is non-obvious.
- Use async/await for all I/O-bound operations.
- Prefer dataclasses or Pydantic models for structured data.

You are also familiar with the following internal guidelines:
- All public APIs must be versioned.
- Services communicate over gRPC with Protobuf schemas checked into the repo.
- Secrets are injected at runtime via environment variables; never hardcoded.
- Observability: every service emits structured JSON logs and OpenTelemetry traces.
- Deployments use Kubernetes with Helm charts; no raw manifests.

When asked to review code, structure your response as:
1. Summary (1-2 sentences)
2. Issues (bulleted, severity labeled)
3. Suggested fix (code block if applicable)
"""
    * 3
)  # repeat to ensure we comfortably exceed the 1,024-token minimum


async def chat_with_caching() -> None:
    """Run a multi-turn chat with prompt caching enabled on the system message."""
    kernel = Kernel()

    service = AnthropicChatCompletion(service_id="anthropic")
    kernel.add_service(service)

    # AnthropicCacheSettings.on() enables caching for both the system message and
    # tool definitions. Use .system() or .tools() to cache only one section.
    # Use .long() for 1-hour TTL when calls are infrequent.
    settings = AnthropicChatPromptExecutionSettings(
        service_id="anthropic",
        max_tokens=512,
        cache=AnthropicCacheSettings.on(),
    )

    chat_history = ChatHistory(system_message=SYSTEM_PROMPT)

    questions = [
        "What is the difference between asyncio.gather and asyncio.TaskGroup?",
        "When would you choose gRPC over REST for an internal service?",
        "How do you structure a Pydantic settings class for a twelve-factor app?",
    ]

    print("Anthropic Prompt Caching Demo")
    print("=" * 50)
    print("System prompt is marked for caching. The first call writes the cache;")
    print("subsequent calls read from it at 0.1x token cost.\n")

    for i, question in enumerate(questions, start=1):
        print(f"Turn {i}: {question}")
        chat_history.add_user_message(question)

        response = await service.get_chat_message_content(
            chat_history=chat_history,
            settings=settings,
        )
        if response:
            print(f"Assistant: {response}\n")
            chat_history.add_message(response)
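            # A hedged aside, not part of the original sample: the Anthropic API
            # reports cache activity in the response `usage` block via
            # `cache_creation_input_tokens` (tokens written to the cache) and
            # `cache_read_input_tokens` (tokens served from it). If the connector
            # exposes the raw response (e.g. on `inner_content`), printing those
            # counters confirms that turn 1 wrote the cache and later turns read it.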


if __name__ == "__main__":
asyncio.run(chat_with_caching())
2 changes: 2 additions & 0 deletions python/semantic_kernel/connectors/ai/anthropic/__init__.py
@@ -1,11 +1,13 @@
# Copyright (c) Microsoft. All rights reserved.

from semantic_kernel.connectors.ai.anthropic.prompt_execution_settings.anthropic_prompt_execution_settings import (
    AnthropicCacheSettings,
    AnthropicChatPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.anthropic.services.anthropic_chat_completion import AnthropicChatCompletion

__all__ = [
"AnthropicCacheSettings",
"AnthropicChatCompletion",
"AnthropicChatPromptExecutionSettings",
]
python/semantic_kernel/connectors/ai/anthropic/prompt_execution_settings/anthropic_prompt_execution_settings.py
@@ -1,17 +1,110 @@
# Copyright (c) Microsoft. All rights reserved.

import logging
from typing import Annotated, Any
from typing import Annotated, Any, ClassVar, Literal

from pydantic import Field, model_validator

from semantic_kernel.connectors.ai.function_choice_type import FunctionChoiceType
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.exceptions import ServiceInvalidExecutionSettingsError
from semantic_kernel.kernel_pydantic import KernelBaseSettings

logger = logging.getLogger(__name__)


class AnthropicCacheSettings(KernelBaseSettings):
    """Configuration for Anthropic prompt caching.

    Controls which parts of the request receive cache_control injection.
    Settings are loaded from environment variables with the prefix 'ANTHROPIC_CACHE_',
    then from a .env file, then from defaults. Explicit constructor arguments always win.

    Environment variables (prefix 'ANTHROPIC_CACHE_'):
    - ANTHROPIC_CACHE_ENABLED — master switch, bool (default: false)
    - ANTHROPIC_CACHE_INCLUDE_SYSTEM — cache system message, bool (default: false)
    - ANTHROPIC_CACHE_INCLUDE_TOOLS — cache tool definitions, bool (default: false)
    - ANTHROPIC_CACHE_TTL — cache TTL, "5m" or "1h" (default: "5m")

    Anthropic minimum token thresholds for cache activation:
    - claude-haiku-4-5 : 4,096 tokens
    - claude-sonnet-4-x: 1,024 tokens
    - claude-opus-4-x  : 1,024 tokens

    TTL options:
    - "5m": ephemeral 5-minute cache (1.25x write cost, 0.1x read cost)
    - "1h": extended 1-hour cache (2x write cost, 0.1x read cost)

    Use the classmethods for common configurations::

        AnthropicCacheSettings.on()      # enable system + tools caching
        AnthropicCacheSettings.off()     # disable all caching (default)
        AnthropicCacheSettings.system()  # cache system message only
        AnthropicCacheSettings.tools()   # cache tool definitions only
    """

    env_prefix: ClassVar[str] = "ANTHROPIC_CACHE_"

    enabled: Annotated[
        bool,
        Field(description="Master switch — disabling skips all cache_control injection regardless of other flags."),
    ] = False
    include_system: Annotated[
        bool,
        Field(description="Inject cache_control on the system message content block."),
    ] = False
    include_tools: Annotated[
        bool,
        Field(description="Inject cache_control on the last tool definition, caching the entire tools array prefix."),
    ] = False
    ttl: Annotated[
        Literal["5m", "1h"],
        Field(description="Cache TTL. '5m' = 5-minute ephemeral (default). '1h' = 1-hour extended."),
    ] = "5m"

    def _cache_control(self) -> dict[str, Any]:
        """Return the cache_control block for the configured TTL."""
        if self.ttl == "1h":
            return {"type": "ephemeral", "ttl": "1h"}
        return {"type": "ephemeral"}

    @classmethod
    def on(cls, ttl: Literal["5m", "1h"] = "5m") -> "AnthropicCacheSettings":
        """Enable caching for all supported request sections (system + tools)."""
        return cls(enabled=True, include_system=True, include_tools=True, ttl=ttl)

    @classmethod
    def off(cls) -> "AnthropicCacheSettings":
        """Disable all cache_control injection."""
        return cls(enabled=False)

    @classmethod
    def system(cls, ttl: Literal["5m", "1h"] = "5m") -> "AnthropicCacheSettings":
        """Enable caching for the system message only."""
        return cls(enabled=True, include_system=True, include_tools=False, ttl=ttl)

    @classmethod
    def tools(cls, ttl: Literal["5m", "1h"] = "5m") -> "AnthropicCacheSettings":
        """Enable caching for tool definitions only."""
        return cls(enabled=True, include_system=False, include_tools=True, ttl=ttl)

    @classmethod
    def short(cls) -> "AnthropicCacheSettings":
        """5-minute TTL. Use for tight agentic loops where the same prompt repeats within minutes.

        Write cost: 1.25x. Read cost: 0.1x. Breaks even after a single cache hit.
        """
        return cls(enabled=True, include_system=True, include_tools=True, ttl="5m")

    @classmethod
    def long(cls) -> "AnthropicCacheSettings":
        """1-hour TTL. Use for batch jobs or scheduled tasks with long gaps between calls.

        Write cost: 2x. Read cost: 0.1x. Needs at least 2 cache hits to break even.
        """
        return cls(enabled=True, include_system=True, include_tools=True, ttl="1h")
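
# A sketch of the break-even arithmetic behind short() and long(), using the
# multipliers quoted in the docstrings above (one write, then 0.1x per read):
# over N total calls, caching beats the 1x base input price when
#   "5m": 1.25 + 0.1 * (N - 1) < N  -> holds from N = 2, i.e. one cache hit saves money
#   "1h": 2.00 + 0.1 * (N - 1) < N  -> holds from N = 3, i.e. two cache hits are needed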


class AnthropicPromptExecutionSettings(PromptExecutionSettings):
    """Common request settings for Anthropic services."""

@@ -23,7 +116,7 @@ class AnthropicChatPromptExecutionSettings(AnthropicPromptExecutionSettings):

    messages: list[dict[str, Any]] | None = None
    stream: bool | None = None
    system: str | None = None
    system: str | list[dict[str, Any]] | None = None
    max_tokens: Annotated[int, Field(gt=0)] = 1024
    temperature: Annotated[float | None, Field(ge=0.0, le=2.0)] = None
    stop_sequences: list[str] | None = None
@@ -43,6 +136,13 @@ class AnthropicChatPromptExecutionSettings(AnthropicPromptExecutionSettings):
description="Do not set this manually. It is set by the service based on the function choice configuration."
),
] = None
cache: Annotated[
AnthropicCacheSettings,
Field(
description="Prompt caching configuration. Disabled by default.",
exclude=True,
),
] = Field(default_factory=AnthropicCacheSettings)

@model_validator(mode="after")
def validate_tool_choice(self) -> "AnthropicChatPromptExecutionSettings":
@@ -53,3 +153,26 @@ def validate_tool_choice(self) -> "AnthropicChatPromptExecutionSettings":
            raise ServiceInvalidExecutionSettingsError("Tool choice 'none' is not supported by Anthropic.")

        return self

    def prepare_settings_dict(self, **kwargs: Any) -> dict[str, Any]:
        """Prepare the settings dictionary, injecting cache_control blocks when caching is enabled."""
        data = super().prepare_settings_dict(**kwargs)

        if not self.cache.enabled:
            return data

        cache_control = self.cache._cache_control()

        if self.cache.include_system:
            system = data.get("system")
            if isinstance(system, str) and system:
                data["system"] = [{"type": "text", "text": system, "cache_control": cache_control}]
            elif isinstance(system, list) and system and "cache_control" not in system[-1]:
                data["system"] = [*system[:-1], {**system[-1], "cache_control": cache_control}]

        if self.cache.include_tools:
            tools: list[dict[str, Any]] | None = data.get("tools")
            if tools and "cache_control" not in tools[-1]:
                data["tools"] = [*tools[:-1], {**tools[-1], "cache_control": cache_control}]

        return data
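
# Illustrative effect of prepare_settings_dict with include_system caching on
# (a sketch; the string is shortened for readability):
#
#   before: data["system"] == "You are an expert software engineer..."
#   after:  data["system"] == [{
#       "type": "text",
#       "text": "You are an expert software engineer...",
#       "cache_control": {"type": "ephemeral"},  # or {"type": "ephemeral", "ttl": "1h"}
#   }]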