
Commit ec189ab

feat: add caching support for ChatAnthropic() (#215)
* Add caching support for Claude
* Update changelog
* Move caching section in docstring
1 parent: 985bf34

2 files changed: +96 -7 lines (CHANGELOG.md, chatlas/_provider_anthropic.py)

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Comprehensive documentation in the [Evals guide](https://posit-dev.github.io/chatlas/misc/evals.html).
 * `ChatOpenAI()` (and `ChatAzureOpenAI()`) gain access to latest models, built-in tools, etc. as a result of moving to the new [Responses API](https://platform.openai.com/docs/api-reference/responses). (#192)
 * Added rudimentary support for a new `ContentThinking` type. (#192)
+* `ChatAnthropic()` and `ChatBedrockAnthropic()` gain new `cache` parameter to control caching. By default it is set to "5m". This should (on average) reduce the cost of your chats. (#215)
 
 ### Changes
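As a quick illustration of the new parameter, a minimal usage sketch (assuming chatlas is installed and `ANTHROPIC_API_KEY` is set; the prompts are made up):

```python
from chatlas import ChatAnthropic

# Default behavior: prompt prefixes are cached for five minutes ("5m").
chat = ChatAnthropic(system_prompt="You are a terse assistant.")
chat.chat("What does prompt caching buy me?")

# Per-turn input / cached-input token counts are available via:
chat.get_tokens()

# Opt out of caching entirely, or extend the cache TTL to one hour:
chat_no_cache = ChatAnthropic(cache="none")
chat_long_cache = ChatAnthropic(cache="1h")
```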

chatlas/_provider_anthropic.py

Lines changed: 95 additions & 7 deletions
@@ -44,6 +44,7 @@
     ToolParam,
     ToolUseBlock,
 )
+from anthropic.types.cache_control_ephemeral_param import CacheControlEphemeralParam
 from anthropic.types.document_block_param import DocumentBlockParam
 from anthropic.types.image_block_param import ImageBlockParam
 from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
@@ -73,6 +74,7 @@ def ChatAnthropic(
     model: "Optional[ModelParam]" = None,
     api_key: Optional[str] = None,
     max_tokens: int = 4096,
+    cache: Literal["5m", "1h", "none"] = "5m",
     kwargs: Optional["ChatClientArgs"] = None,
 ) -> Chat["SubmitInputArgs", Message]:
     """
@@ -125,6 +127,10 @@ def ChatAnthropic(
         variable.
     max_tokens
         Maximum number of tokens to generate before stopping.
+    cache
+        How long to cache inputs? Defaults to "5m" (five minutes).
+        Set to "none" to disable caching or "1h" to cache for one hour.
+        See the Caching section for details.
     kwargs
         Additional arguments to pass to the `anthropic.Anthropic()` client
         constructor.
@@ -169,6 +175,46 @@ def ChatAnthropic(
     ```shell
     export ANTHROPIC_API_KEY=...
     ```
+
+    Caching
+    -------
+
+    Caching with Claude is a bit more complicated than other providers but we
+    believe that on average it will save you both money and time, so we have
+    enabled it by default. With other providers, like OpenAI and Google,
+    you only pay for cache reads, which cost 10% of the normal price. With
+    Claude, you also pay for cache writes, which cost 125% of the normal price
+    for 5 minute caching and 200% of the normal price for 1 hour caching.
+
+    How does this affect the total cost of a conversation? Imagine the first
+    turn sends 1000 input tokens and receives 200 output tokens. The second
+    turn must first send both the input and output from the previous turn
+    (1200 tokens). It then sends a further 1000 tokens and receives 200 tokens
+    back.
+
+    To compare the prices of these two approaches we can ignore the cost of
+    output tokens, because they are the same for both. How much will the input
+    tokens cost? If we don't use caching, we send 1000 tokens in the first turn
+    and 2200 (1000 + 200 + 1000) tokens in the second turn for a total of 3200
+    tokens. If we use caching, we'll send (the equivalent of) 1000 * 1.25 = 1250
+    tokens in the first turn. In the second turn, 1000 of the input tokens will
+    be cached so the total cost is 1000 * 0.1 + (200 + 1000) * 1.25 = 1600
+    tokens. That makes a total of 2850 tokens, i.e. 11% fewer tokens,
+    decreasing the overall cost.
+
+    Obviously, the details will vary from conversation to conversation, but
+    if you have a large system prompt that you re-use many times you should
+    expect to see larger savings. You can see exactly how many input and
+    cache input tokens each turn uses, along with the total cost,
+    with `chat.get_tokens()`. If you don't see savings for your use case, you can
+    suppress caching with `cache="none"`.
+
+    Note: Claude will only cache longer prompts, with caching requiring at least
+    1024-4096 tokens, depending on the model. So don't be surprised if you
+    don't see any differences with caching if you have a short prompt.
+
+    See all the details at
+    <https://docs.claude.com/en/docs/build-with-claude/prompt-caching>.
     """
 
     if model is None:
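The arithmetic in that Caching section can be checked directly; a short sketch in plain Python (token-equivalents only, ignoring output tokens as the docstring does):

```python
# Two-turn conversation: 1000 new input tokens per turn, 200 output tokens per turn.

# Without caching: 1000 in turn 1, then 1000 + 200 + 1000 resent in turn 2.
no_cache = 1000 + (1000 + 200 + 1000)        # 3200

# With 5-minute caching: cache writes bill at 125%, cache reads at 10%.
turn1 = 1000 * 1.25                          # 1250 (all of turn 1 written to cache)
turn2 = 1000 * 0.10 + (200 + 1000) * 1.25    # 1600 (1000 read back, 1200 newly written)
with_cache = turn1 + turn2                   # 2850

print(round(1 - with_cache / no_cache, 3))   # 0.109 -> roughly 11% fewer token-equivalents
```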
@@ -179,6 +225,7 @@ def ChatAnthropic(
             api_key=api_key,
             model=model,
             max_tokens=max_tokens,
+            cache=cache,
             kwargs=kwargs,
         ),
         system_prompt=system_prompt,
@@ -195,6 +242,7 @@ def __init__(
         model: str,
         api_key: Optional[str] = None,
         name: str = "Anthropic",
+        cache: Literal["5m", "1h", "none"] = "5m",
         kwargs: Optional["ChatClientArgs"] = None,
     ):
         super().__init__(name=name, model=model)
@@ -206,6 +254,7 @@ def __init__(
                 "You can install it with 'pip install anthropic'."
             )
         self._max_tokens = max_tokens
+        self._cache: Literal["5m", "1h", "none"] = cache
 
         kwargs_full: "ChatClientArgs" = {
             "api_key": api_key,
@@ -365,7 +414,13 @@ def _structured_tool_call(**kwargs: Any):
 
         if "system" not in kwargs_full:
             if len(turns) > 0 and turns[0].role == "system":
-                kwargs_full["system"] = turns[0].text
+                sys_param: "TextBlockParam" = {
+                    "type": "text",
+                    "text": turns[0].text,
+                }
+                if self._cache_control():
+                    sys_param["cache_control"] = self._cache_control()
+                kwargs_full["system"] = [sys_param]
 
         return kwargs_full
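For orientation, when caching is enabled the `system` payload built above ends up shaped roughly like this (plain dicts for illustration; the real code uses Anthropic's `TextBlockParam` and `CacheControlEphemeralParam` typed dicts, and the prompt text is made up):

```python
kwargs_full["system"] = [
    {
        "type": "text",
        "text": "You are a terse assistant.",                 # turns[0].text
        "cache_control": {"type": "ephemeral", "ttl": "5m"},  # omitted when cache="none"
    }
]
```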

@@ -418,11 +473,16 @@ def value_turn(self, completion, has_data_model) -> Turn:
 
     def value_tokens(self, completion):
         usage = completion.usage
-        # N.B. Currently, Anthropic doesn't cache by default and we currently do not support
-        # manual caching in chatlas. Note also that this only tracks reads, NOT writes, which
-        # have their own cost. To track that properly, we would need another caching category and per-token cost.
+        input_tokens = completion.usage.input_tokens
+
+        # Account for cache writes by adjusting input tokens
+        # Cache writes cost 125% for 5m and 200% for 1h
+        # https://docs.claude.com/en/docs/build-with-claude/prompt-caching
+        cache_input = usage.cache_creation_input_tokens or 0
+        cache_mult = 2.0 if self._cache == "1h" else 1.25
+
         return (
-            completion.usage.input_tokens,
+            input_tokens + int(cache_input * cache_mult),
             completion.usage.output_tokens,
             usage.cache_read_input_tokens if usage.cache_read_input_tokens else 0,
         )
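A small sketch of how that adjustment plays out with made-up usage numbers (the multipliers mirror the comment in the diff):

```python
# Hypothetical usage reported by one completion:
cache_creation_input_tokens = 800   # tokens written to the cache this turn
input_tokens = 200                  # uncached input tokens
cache_mult = 1.25                   # "5m" caching; 2.0 if cache == "1h"

billed_input = input_tokens + int(cache_creation_input_tokens * cache_mult)
print(billed_input)                 # 200 + 1000 = 1200 input-token equivalents
```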
@@ -510,13 +570,21 @@ def supported_model_params(self) -> set[StandardModelParamNames]:
 
     def _as_message_params(self, turns: list[Turn]) -> list["MessageParam"]:
         messages: list["MessageParam"] = []
-        for turn in turns:
+        for i, turn in enumerate(turns):
             if turn.role == "system":
                 continue  # system prompt passed as separate arg
             if turn.role not in ["user", "assistant"]:
                 raise ValueError(f"Unknown role {turn.role}")
 
             content = [self._as_content_block(c) for c in turn.contents]
+
+            # Add cache control to the last content block in the last turn
+            # https://docs.claude.com/en/docs/build-with-claude/prompt-caching#how-automatic-prefix-checking-works
+            is_last_turn = i == len(turns) - 1
+            if is_last_turn and len(content) > 0:
+                if self._cache_control():
+                    content[-1]["cache_control"] = self._cache_control()
+
             role = "user" if turn.role == "user" else "assistant"
             messages.append({"role": role, "content": content})
         return messages
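Because of Anthropic's automatic prefix checking, only the final content block of the final turn needs the marker; a rough sketch of the resulting `messages` shape (plain dicts, contents made up):

```python
messages = [
    {"role": "user", "content": [{"type": "text", "text": "First question"}]},
    {"role": "assistant", "content": [{"type": "text", "text": "First answer"}]},
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Follow-up question",
                # only the last block of the last turn gets cache_control
                "cache_control": {"type": "ephemeral", "ttl": "5m"},
            }
        ],
    },
]
```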
@@ -744,11 +812,20 @@ def batch_result_turn(self, result, has_data_model: bool = False) -> Turn | None
         message = result.result.message
         return self._as_turn(message, has_data_model)
 
+    def _cache_control(self) -> "Optional[CacheControlEphemeralParam]":
+        if self._cache == "none":
+            return None
+        return {
+            "type": "ephemeral",
+            "ttl": self._cache,
+        }
+
 
 def ChatBedrockAnthropic(
     *,
     model: Optional[str] = None,
     max_tokens: int = 4096,
+    cache: Literal["5m", "1h", "none"] = "5m",
     aws_secret_key: Optional[str] = None,
     aws_access_key: Optional[str] = None,
     aws_region: Optional[str] = None,
@@ -804,6 +881,10 @@ def ChatBedrockAnthropic(
         The model to use for the chat.
     max_tokens
         Maximum number of tokens to generate before stopping.
+    cache
+        How long to cache inputs? Defaults to "5m" (five minutes).
+        Set to "none" to disable caching or "1h" to cache for one hour.
+        See the Caching section of `ChatAnthropic` for details.
     aws_secret_key
         The AWS secret key to use for authentication.
     aws_access_key
@@ -885,6 +966,7 @@ def ChatBedrockAnthropic(
         provider=AnthropicBedrockProvider(
             model=model,
             max_tokens=max_tokens,
+            cache=cache,
             aws_secret_key=aws_secret_key,
             aws_access_key=aws_access_key,
             aws_region=aws_region,
@@ -908,11 +990,17 @@ def __init__(
         aws_profile: str | None,
         aws_session_token: str | None,
         max_tokens: int = 4096,
+        cache: Literal["5m", "1h", "none"] = "5m",
         base_url: str | None,
         name: str = "AWS/Bedrock",
         kwargs: Optional["ChatBedrockClientArgs"] = None,
     ):
-        super().__init__(name=name, model=model, max_tokens=max_tokens)
+        super().__init__(
+            name=name,
+            model=model,
+            max_tokens=max_tokens,
+            cache=cache,
+        )
 
         try:
             from anthropic import AnthropicBedrock, AsyncAnthropicBedrock
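Since `AnthropicBedrockProvider` now threads `cache` through to the shared Anthropic base class, the same knob is available on Bedrock; a hedged usage sketch (assumes AWS credentials are already configured in the environment; the model id is illustrative):

```python
from chatlas import ChatBedrockAnthropic

chat = ChatBedrockAnthropic(
    model="anthropic.claude-3-5-sonnet-20240620-v1:0",  # illustrative Bedrock model id
    cache="1h",  # one-hour caching: writes bill at 200%, reads at 10%
)
chat.chat("Summarize the key points of our onboarding guide.")
```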
