# Copyright (c) Microsoft. All rights reserved.

# Demonstrates Anthropic prompt caching through Semantic Kernel.
#
# Prompt caching marks parts of a request (system message, tool definitions) as
# cacheable, so repeated calls reuse the cached tokens at 0.1x read cost.
#
# Prerequisites:
# - ANTHROPIC_API_KEY and ANTHROPIC_CHAT_MODEL_ID set in the environment or a .env file.
# - A model that supports caching (claude-haiku-4-5, claude-sonnet-4-x, claude-opus-4-x).
# - Enough prompt tokens to activate the cache: 4,096 (Haiku), 1,024 (Sonnet/Opus).
#
# Run:
# uv run python samples/concepts/caching/anthropic_prompt_caching.py

import asyncio

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.anthropic import (
    AnthropicCacheSettings,
    AnthropicChatCompletion,
    AnthropicChatPromptExecutionSettings,
)
from semantic_kernel.contents import ChatHistory

# A long system prompt that exceeds the minimum token threshold for caching.
# In production this would typically be a large instruction set, persona, or
# document that stays the same across many turns.
SYSTEM_PROMPT = (
    """
You are an expert software engineer specializing in Python and distributed systems.
You provide precise, production-quality answers. When writing code you follow these rules:
 - Use type hints throughout.
 - Prefer composition over inheritance.
 - Write small, single-purpose functions.
 - Handle errors explicitly; never silence exceptions.
 - Include a brief docstring only when the intent is non-obvious.
 - Use async/await for all I/O-bound operations.
 - Prefer dataclasses or Pydantic models for structured data.

You are also familiar with the following internal guidelines:
 - All public APIs must be versioned.
 - Services communicate over gRPC with Protobuf schemas checked into the repo.
 - Secrets are injected at runtime via environment variables; never hardcoded.
 - Observability: every service emits structured JSON logs and OpenTelemetry traces.
 - Deployments use Kubernetes with Helm charts; no raw manifests.

When asked to review code, structure your response as:
 1. Summary (1-2 sentences)
 2. Issues (bulleted, severity labeled)
 3. Suggested fix (code block if applicable)
"""
    * 3
)  # repeat to ensure we comfortably exceed the 1,024-token minimum


async def chat_with_caching() -> None:
    """Run a multi-turn chat with prompt caching enabled on the system message."""
    kernel = Kernel()

    chat_service = AnthropicChatCompletion(service_id="anthropic")
    kernel.add_service(chat_service)

    # AnthropicCacheSettings.on() enables caching for both the system message and
    # tool definitions. Use .system() or .tools() to cache only one section.
    # Use .long() for 1-hour TTL when calls are infrequent.
    execution_settings = AnthropicChatPromptExecutionSettings(
        service_id="anthropic",
        max_tokens=512,
        cache=AnthropicCacheSettings.on(),
    )

    history = ChatHistory(system_message=SYSTEM_PROMPT)

    questions = [
        "What is the difference between asyncio.gather and asyncio.TaskGroup?",
        "When would you choose gRPC over REST for an internal service?",
        "How do you structure a Pydantic settings class for a twelve-factor app?",
    ]

    print("Anthropic Prompt Caching Demo")
    print("=" * 50)
    print("System prompt is marked for caching. The first call writes the cache;")
    print("subsequent calls read from it at 0.1x token cost.\n")

    for turn, question in enumerate(questions, start=1):
        print(f"Turn {turn}: {question}")
        history.add_user_message(question)

        reply = await chat_service.get_chat_message_content(
            chat_history=history,
            settings=execution_settings,
        )
        if reply:
            print(f"Assistant: {reply}\n")
            # Keep the assistant turn in history so later turns share context.
            history.add_message(reply)


if __name__ == "__main__":
    asyncio.run(chat_with_caching())
import logging
from typing import Annotated, Any, ClassVar, Literal

from pydantic import Field, model_validator

from semantic_kernel.connectors.ai.function_choice_type import FunctionChoiceType
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.exceptions import ServiceInvalidExecutionSettingsError
from semantic_kernel.kernel_pydantic import KernelBaseSettings

logger = logging.getLogger(__name__)


class AnthropicCacheSettings(KernelBaseSettings):
    """Configuration for Anthropic prompt caching.

    Decides which sections of a request get a ``cache_control`` block injected.
    Values are resolved in priority order: explicit constructor arguments, then
    environment variables with the ``ANTHROPIC_CACHE_`` prefix, then a .env
    file, then the field defaults.

    Recognized environment variables:
        - ``ANTHROPIC_CACHE_ENABLED`` — master switch, bool (default: false)
        - ``ANTHROPIC_CACHE_INCLUDE_SYSTEM`` — cache system message, bool (default: false)
        - ``ANTHROPIC_CACHE_INCLUDE_TOOLS`` — cache tool definitions, bool (default: false)
        - ``ANTHROPIC_CACHE_TTL`` — cache TTL, "5m" or "1h" (default: "5m")

    Anthropic only activates the cache above a minimum prompt size:
        - claude-haiku-4-5 : 4,096 tokens
        - claude-sonnet-4-x: 1,024 tokens
        - claude-opus-4-x  : 1,024 tokens

    TTL trade-off:
        - "5m": ephemeral 5-minute cache (1.25x write cost, 0.1x read cost)
        - "1h": extended 1-hour cache (2x write cost, 0.1x read cost)

    Convenience constructors for the common setups::

        AnthropicCacheSettings.on()      # enable system + tools caching
        AnthropicCacheSettings.off()     # disable all caching (default)
        AnthropicCacheSettings.system()  # cache system message only
        AnthropicCacheSettings.tools()   # cache tool definitions only
    """

    env_prefix: ClassVar[str] = "ANTHROPIC_CACHE_"

    enabled: Annotated[
        bool,
        Field(description="Master switch — disabling skips all cache_control injection regardless of other flags."),
    ] = False
    include_system: Annotated[
        bool,
        Field(description="Inject cache_control on the system message content block."),
    ] = False
    include_tools: Annotated[
        bool,
        Field(description="Inject cache_control on the last tool definition, caching the entire tools array prefix."),
    ] = False
    ttl: Annotated[
        Literal["5m", "1h"],
        Field(description="Cache TTL. '5m' = 5-minute ephemeral (default). '1h' = 1-hour extended."),
    ] = "5m"

    def _cache_control(self) -> dict[str, Any]:
        """Build the cache_control payload for the configured TTL.

        The 5-minute tier is Anthropic's default, so the ``ttl`` key is only
        emitted for the extended 1-hour tier.
        """
        control: dict[str, Any] = {"type": "ephemeral"}
        if self.ttl == "1h":
            control["ttl"] = "1h"
        return control

    @classmethod
    def on(cls, ttl: Literal["5m", "1h"] = "5m") -> "AnthropicCacheSettings":
        """Enable caching for every supported request section (system + tools)."""
        return cls(enabled=True, include_system=True, include_tools=True, ttl=ttl)

    @classmethod
    def off(cls) -> "AnthropicCacheSettings":
        """Disable all cache_control injection."""
        return cls(enabled=False)

    @classmethod
    def system(cls, ttl: Literal["5m", "1h"] = "5m") -> "AnthropicCacheSettings":
        """Enable caching for the system message only."""
        return cls(enabled=True, include_system=True, include_tools=False, ttl=ttl)

    @classmethod
    def tools(cls, ttl: Literal["5m", "1h"] = "5m") -> "AnthropicCacheSettings":
        """Enable caching for tool definitions only."""
        return cls(enabled=True, include_system=False, include_tools=True, ttl=ttl)

    @classmethod
    def short(cls) -> "AnthropicCacheSettings":
        """Full caching with the 5-minute TTL.

        Suited to tight agentic loops where the same prompt repeats within
        minutes. Write cost 1.25x, read cost 0.1x — breaks even after a single
        cache hit.
        """
        return cls.on(ttl="5m")

    @classmethod
    def long(cls) -> "AnthropicCacheSettings":
        """Full caching with the 1-hour TTL.

        Suited to batch jobs or scheduled tasks with long gaps between calls.
        Write cost 2x, read cost 0.1x — needs at least 2 cache hits to break
        even.
        """
        return cls.on(ttl="1h")
+ + Write cost: 2x. Read cost: 0.1x. Needs at least 2 cache hits to break even. + """ + return cls(enabled=True, include_system=True, include_tools=True, ttl="1h") + + class AnthropicPromptExecutionSettings(PromptExecutionSettings): """Common request settings for Anthropic services.""" @@ -23,7 +116,7 @@ class AnthropicChatPromptExecutionSettings(AnthropicPromptExecutionSettings): messages: list[dict[str, Any]] | None = None stream: bool | None = None - system: str | None = None + system: str | list[dict[str, Any]] | None = None max_tokens: Annotated[int, Field(gt=0)] = 1024 temperature: Annotated[float | None, Field(ge=0.0, le=2.0)] = None stop_sequences: list[str] | None = None @@ -43,6 +136,13 @@ class AnthropicChatPromptExecutionSettings(AnthropicPromptExecutionSettings): description="Do not set this manually. It is set by the service based on the function choice configuration." ), ] = None + cache: Annotated[ + AnthropicCacheSettings, + Field( + description="Prompt caching configuration. 
Disabled by default.", + exclude=True, + ), + ] = Field(default_factory=AnthropicCacheSettings) @model_validator(mode="after") def validate_tool_choice(self) -> "AnthropicChatPromptExecutionSettings": @@ -53,3 +153,26 @@ def validate_tool_choice(self) -> "AnthropicChatPromptExecutionSettings": raise ServiceInvalidExecutionSettingsError("Tool choice 'none' is not supported by Anthropic.") return self + + def prepare_settings_dict(self, **kwargs: Any) -> dict[str, Any]: + """Prepare the settings dictionary, injecting cache_control blocks when caching is enabled.""" + data = super().prepare_settings_dict(**kwargs) + + if not self.cache.enabled: + return data + + cache_control = self.cache._cache_control() + + if self.cache.include_system: + system = data.get("system") + if isinstance(system, str) and system: + data["system"] = [{"type": "text", "text": system, "cache_control": cache_control}] + elif isinstance(system, list) and system and "cache_control" not in system[-1]: + data["system"] = [*system[:-1], {**system[-1], "cache_control": cache_control}] + + if self.cache.include_tools: + tools: list[dict[str, Any]] | None = data.get("tools") + if tools and "cache_control" not in tools[-1]: + data["tools"] = [*tools[:-1], {**tools[-1], "cache_control": cache_control}] + + return data diff --git a/python/tests/unit/connectors/ai/anthropic/test_anthropic_request_settings.py b/python/tests/unit/connectors/ai/anthropic/test_anthropic_request_settings.py index 531823281ae2..885915d1e3ae 100644 --- a/python/tests/unit/connectors/ai/anthropic/test_anthropic_request_settings.py +++ b/python/tests/unit/connectors/ai/anthropic/test_anthropic_request_settings.py @@ -3,6 +3,7 @@ import pytest from semantic_kernel.connectors.ai.anthropic.prompt_execution_settings.anthropic_prompt_execution_settings import ( + AnthropicCacheSettings, AnthropicChatPromptExecutionSettings, ) from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior @@ -127,3 +128,252 @@ 
# region AnthropicCacheSettings


def test_cache_settings_default_is_off():
    cfg = AnthropicCacheSettings()
    assert (cfg.enabled, cfg.include_system, cfg.include_tools, cfg.ttl) == (False, False, False, "5m")


def test_cache_settings_on():
    cfg = AnthropicCacheSettings.on()
    assert (cfg.enabled, cfg.include_system, cfg.include_tools, cfg.ttl) == (True, True, True, "5m")


def test_cache_settings_on_with_1h_ttl():
    cfg = AnthropicCacheSettings.on(ttl="1h")
    assert cfg.enabled is True
    assert cfg.ttl == "1h"


def test_cache_settings_off():
    assert AnthropicCacheSettings.off().enabled is False


def test_cache_settings_system_only():
    cfg = AnthropicCacheSettings.system()
    assert (cfg.enabled, cfg.include_system, cfg.include_tools) == (True, True, False)


def test_cache_settings_tools_only():
    cfg = AnthropicCacheSettings.tools()
    assert (cfg.enabled, cfg.include_system, cfg.include_tools) == (True, False, True)


def test_cache_control_5m_via_prepare():
    """5m TTL emits ephemeral block without a ttl key."""
    cfg = AnthropicChatPromptExecutionSettings(
        system="Hello.",
        cache=AnthropicCacheSettings.on(ttl="5m"),
    )
    payload = cfg.prepare_settings_dict()
    assert payload["system"][0]["cache_control"] == {"type": "ephemeral"}


def test_cache_control_1h_via_prepare():
    """1h TTL emits ephemeral block with ttl string '1h'."""
    cfg = AnthropicChatPromptExecutionSettings(
        system="Hello.",
        cache=AnthropicCacheSettings.on(ttl="1h"),
    )
    payload = cfg.prepare_settings_dict()
    assert payload["system"][0]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}


def test_cache_settings_short():
    cfg = AnthropicCacheSettings.short()
    assert (cfg.enabled, cfg.include_system, cfg.include_tools, cfg.ttl) == (True, True, True, "5m")


def test_cache_settings_long():
    cfg = AnthropicCacheSettings.long()
    assert (cfg.enabled, cfg.include_system, cfg.include_tools, cfg.ttl) == (True, True, True, "1h")


# endregion

# region prepare_settings_dict with caching


def test_prepare_settings_dict_cache_off_no_injection():
    cfg = AnthropicChatPromptExecutionSettings(
        system="You are a helpful assistant.",
        tools=[{"name": "search", "description": "Search the web"}],
        cache=AnthropicCacheSettings.off(),
    )
    payload = cfg.prepare_settings_dict()
    assert payload["system"] == "You are a helpful assistant."
    assert "cache_control" not in payload["tools"][-1]


def test_prepare_settings_dict_include_system_only():
    cfg = AnthropicChatPromptExecutionSettings(
        system="You are a helpful assistant.",
        cache=AnthropicCacheSettings.system(),
    )
    payload = cfg.prepare_settings_dict()
    assert isinstance(payload["system"], list)
    assert payload["system"] == [
        {"type": "text", "text": "You are a helpful assistant.", "cache_control": {"type": "ephemeral"}}
    ]


def test_prepare_settings_dict_include_tools_only():
    tool_defs = [
        {"name": "tool_a", "description": "Tool A"},
        {"name": "tool_b", "description": "Tool B"},
    ]
    cfg = AnthropicChatPromptExecutionSettings(
        tools=tool_defs,
        cache=AnthropicCacheSettings.tools(),
    )
    payload = cfg.prepare_settings_dict()
    assert "cache_control" not in payload["tools"][0]
    assert payload["tools"][-1]["cache_control"] == {"type": "ephemeral"}
    # original tools list must not be mutated
    assert "cache_control" not in tool_defs[-1]


def test_prepare_settings_dict_cache_on_system_and_tools():
    tool_defs = [{"name": "search", "description": "Search the web"}]
    cfg = AnthropicChatPromptExecutionSettings(
        system="You are a helpful assistant.",
        tools=tool_defs,
        cache=AnthropicCacheSettings.on(),
    )
    payload = cfg.prepare_settings_dict()
    assert isinstance(payload["system"], list)
    assert payload["system"][0]["cache_control"] == {"type": "ephemeral"}
    assert payload["tools"][-1]["cache_control"] == {"type": "ephemeral"}


def test_prepare_settings_dict_cache_on_1h_ttl():
    tool_defs = [{"name": "search", "description": "Search the web"}]
    cfg = AnthropicChatPromptExecutionSettings(
        system="You are a helpful assistant.",
        tools=tool_defs,
        cache=AnthropicCacheSettings.on(ttl="1h"),
    )
    payload = cfg.prepare_settings_dict()
    assert payload["system"][0]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}
    assert payload["tools"][-1]["cache_control"] == {"type": "ephemeral", "ttl": "1h"}


def test_prepare_settings_dict_include_system_already_list():
    """When system is pre-structured as list[dict], cache_control is injected on the last block."""
    system_blocks = [
        {"type": "text", "text": "First block."},
        {"type": "text", "text": "Second block."},
    ]
    cfg = AnthropicChatPromptExecutionSettings(
        system=system_blocks,
        cache=AnthropicCacheSettings.system(),
    )
    payload = cfg.prepare_settings_dict()
    assert isinstance(payload["system"], list)
    assert "cache_control" not in payload["system"][0]
    assert payload["system"][-1]["cache_control"] == {"type": "ephemeral"}
    # original list must not be mutated
    assert "cache_control" not in system_blocks[-1]


def test_prepare_settings_dict_include_system_empty_string_no_injection():
    """Empty system string should not be wrapped in a cache block."""
    cfg = AnthropicChatPromptExecutionSettings(
        system="",
        cache=AnthropicCacheSettings.system(),
    )
    payload = cfg.prepare_settings_dict()
    # empty string — no injection expected
    assert not isinstance(payload.get("system"), list)


def test_prepare_settings_dict_include_tools_empty_no_injection():
    """No tools present — include_tools flag should be a no-op."""
    cfg = AnthropicChatPromptExecutionSettings(
        cache=AnthropicCacheSettings.tools(),
    )
    payload = cfg.prepare_settings_dict()
    assert payload.get("tools") is None


def test_prepare_settings_dict_cache_excluded_from_serialization():
    """The cache field must not appear in the serialized API payload."""
    cfg = AnthropicChatPromptExecutionSettings(cache=AnthropicCacheSettings.on())
    payload = cfg.prepare_settings_dict()
    assert "cache" not in payload


def test_prepare_settings_dict_existing_cache_control_not_overwritten():
    """cache_control already present on the last tool/system block must not be clobbered."""
    existing_ctrl = {"type": "ephemeral", "ttl": "1h"}
    tool_defs = [{"name": "t", "description": "d", "cache_control": existing_ctrl}]
    cfg = AnthropicChatPromptExecutionSettings(
        tools=tool_defs,
        cache=AnthropicCacheSettings.tools(ttl="5m"),
    )
    payload = cfg.prepare_settings_dict()
    assert payload["tools"][-1]["cache_control"] == existing_ctrl


# endregion

# region AnthropicCacheSettings — environment variable support


def test_cache_settings_from_env(monkeypatch):
    """Settings are populated from ANTHROPIC_CACHE_* env vars."""
    monkeypatch.setenv("ANTHROPIC_CACHE_ENABLED", "true")
    monkeypatch.setenv("ANTHROPIC_CACHE_INCLUDE_SYSTEM", "true")
    monkeypatch.setenv("ANTHROPIC_CACHE_INCLUDE_TOOLS", "false")
    monkeypatch.setenv("ANTHROPIC_CACHE_TTL", "1h")
    cfg = AnthropicCacheSettings()
    assert (cfg.enabled, cfg.include_system, cfg.include_tools, cfg.ttl) == (True, True, False, "1h")


def test_cache_settings_explicit_overrides_env(monkeypatch):
    """Explicit constructor arguments take priority over environment variables."""
    monkeypatch.setenv("ANTHROPIC_CACHE_ENABLED", "true")
    monkeypatch.setenv("ANTHROPIC_CACHE_TTL", "1h")
    cfg = AnthropicCacheSettings(enabled=False, ttl="5m")
    assert cfg.enabled is False
    assert cfg.ttl == "5m"


def test_cache_settings_env_disabled_by_default(monkeypatch):
    """With no env vars set, cache is disabled by default."""
    env_keys = (
        "ANTHROPIC_CACHE_ENABLED",
        "ANTHROPIC_CACHE_INCLUDE_SYSTEM",
        "ANTHROPIC_CACHE_INCLUDE_TOOLS",
        "ANTHROPIC_CACHE_TTL",
    )
    for key in env_keys:
        monkeypatch.delenv(key, raising=False)
    assert AnthropicCacheSettings().enabled is False


# endregion