     ToolParam,
     ToolUseBlock,
 )
+from anthropic.types.cache_control_ephemeral_param import CacheControlEphemeralParam
 from anthropic.types.document_block_param import DocumentBlockParam
 from anthropic.types.image_block_param import ImageBlockParam
 from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
@@ -73,6 +74,7 @@ def ChatAnthropic(
     model: "Optional[ModelParam]" = None,
     api_key: Optional[str] = None,
     max_tokens: int = 4096,
+    cache: Literal["5m", "1h", "none"] = "5m",
     kwargs: Optional["ChatClientArgs"] = None,
 ) -> Chat["SubmitInputArgs", Message]:
     """
@@ -125,6 +127,10 @@ def ChatAnthropic(
         variable.
     max_tokens
         Maximum number of tokens to generate before stopping.
+    cache
+        How long to cache inputs? Defaults to "5m" (five minutes).
+        Set to "none" to disable caching or "1h" to cache for one hour.
+        See the Caching section for details.
     kwargs
         Additional arguments to pass to the `anthropic.Anthropic()` client
         constructor.
@@ -169,6 +175,46 @@ def ChatAnthropic(
     ```shell
     export ANTHROPIC_API_KEY=...
     ```
+
+    Caching
+    -------
+
+    Caching with Claude is a bit more complicated than with other providers,
+    but we believe that on average it will save you both money and time, so
+    we have enabled it by default. With other providers, like OpenAI and
+    Google, you only pay for cache reads, which cost 10% of the normal price.
+    With Claude, you also pay for cache writes, which cost 125% of the normal
+    price for 5-minute caching and 200% of the normal price for 1-hour caching.
+
+    How does this affect the total cost of a conversation? Imagine the first
+    turn sends 1000 input tokens and receives 200 output tokens. The second
+    turn must first send both the input and output from the previous turn
+    (1200 tokens). It then sends a further 1000 tokens and receives 200
+    tokens back.
+
+    To compare the costs of these two approaches we can ignore the output
+    tokens, because they cost the same either way. How much will the input
+    tokens cost? If we don't use caching, we send 1000 tokens in the first
+    turn and 2200 (1000 + 200 + 1000) tokens in the second turn, for a total
+    of 3200 tokens. If we use caching, we'll send (the equivalent of)
+    1000 * 1.25 = 1250 tokens in the first turn. In the second turn, 1000 of
+    the input tokens will be cached, so the total cost is
+    1000 * 0.1 + (200 + 1000) * 1.25 = 1600 tokens. That makes a total of
+    2850 tokens, i.e. about 11% fewer, decreasing the overall cost.
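+
+    To make that arithmetic concrete, here is the same calculation as a
+    quick back-of-the-envelope sketch in plain Python:
+
+    ```python
+    # without caching: turn 1 input, then turn 2 resends history + new prompt
+    uncached = 1000 + (1000 + 200 + 1000)  # 3200 tokens
+
+    # with 5m caching: cache writes bill at 1.25x, cache reads at 0.1x
+    cached = 1000 * 1.25 + 1000 * 0.1 + (200 + 1000) * 1.25  # 2850 tokens
+
+    round(1 - cached / uncached, 2)  # 0.11, i.e. ~11% cheaper
+    ```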
+
+    Obviously, the details will vary from conversation to conversation, but
+    if you have a large system prompt that you re-use many times you should
+    expect to see larger savings. You can see exactly how many input and
+    cached input tokens each turn uses, along with the total cost, with
+    `chat.get_tokens()`. If you don't see savings for your use case, you can
+    suppress caching with `cache="none"`.
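+
+    For example, to turn caching off entirely and inspect per-turn token
+    usage yourself:
+
+    ```python
+    from chatlas import ChatAnthropic
+
+    chat = ChatAnthropic(cache="none")
+    chat.chat("Tell me a joke about caching.")
+    chat.get_tokens()
+    ```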
+
+    Note that Claude only caches longer prompts: caching requires at least
+    1024-4096 tokens, depending on the model. So don't be surprised if
+    caching makes no difference when your prompts are short.
+
+    See all the details at
+    <https://docs.claude.com/en/docs/build-with-claude/prompt-caching>.
     """
 
     if model is None:
@@ -179,6 +225,7 @@ def ChatAnthropic(
             api_key=api_key,
             model=model,
             max_tokens=max_tokens,
+            cache=cache,
             kwargs=kwargs,
         ),
         system_prompt=system_prompt,
@@ -195,6 +242,7 @@ def __init__(
         model: str,
         api_key: Optional[str] = None,
         name: str = "Anthropic",
+        cache: Literal["5m", "1h", "none"] = "5m",
         kwargs: Optional["ChatClientArgs"] = None,
     ):
         super().__init__(name=name, model=model)
@@ -206,6 +254,7 @@ def __init__(
206254 "You can install it with 'pip install anthropic'."
207255 )
208256 self ._max_tokens = max_tokens
257+ self ._cache : Literal ["5m" , "1h" , "none" ] = cache
209258
210259 kwargs_full : "ChatClientArgs" = {
211260 "api_key" : api_key ,
@@ -365,7 +414,13 @@ def _structured_tool_call(**kwargs: Any):
 
         if "system" not in kwargs_full:
             if len(turns) > 0 and turns[0].role == "system":
-                kwargs_full["system"] = turns[0].text
+                sys_param: "TextBlockParam" = {
+                    "type": "text",
+                    "text": turns[0].text,
+                }
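+                # Mark the system prompt as a cacheable prefix (a no-op when
+                # cache="none")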
+                if self._cache_control():
+                    sys_param["cache_control"] = self._cache_control()
+                kwargs_full["system"] = [sys_param]
 
         return kwargs_full
 
@@ -418,11 +473,16 @@ def value_turn(self, completion, has_data_model) -> Turn:
 
     def value_tokens(self, completion):
         usage = completion.usage
-        # N.B. Currently, Anthropic doesn't cache by default and we currently do not support
-        # manual caching in chatlas. Note also that this only tracks reads, NOT writes, which
-        # have their own cost. To track that properly, we would need another caching category and per-token cost.
+        input_tokens = completion.usage.input_tokens
+
+        # Account for cache writes by adjusting input tokens.
+        # Cache writes cost 125% for 5m and 200% for 1h:
+        # https://docs.claude.com/en/docs/build-with-claude/prompt-caching
+        cache_input = usage.cache_creation_input_tokens or 0
+        cache_mult = 2.0 if self._cache == "1h" else 1.25
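+        # e.g. writing 1000 tokens to the 5m cache bills like 1250 regular
+        # input tokens (2000 with the 1h cache)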
+
         return (
-            completion.usage.input_tokens,
+            input_tokens + int(cache_input * cache_mult),
             completion.usage.output_tokens,
             usage.cache_read_input_tokens if usage.cache_read_input_tokens else 0,
         )
@@ -510,13 +570,21 @@ def supported_model_params(self) -> set[StandardModelParamNames]:
 
     def _as_message_params(self, turns: list[Turn]) -> list["MessageParam"]:
         messages: list["MessageParam"] = []
-        for turn in turns:
+        for i, turn in enumerate(turns):
             if turn.role == "system":
                 continue  # system prompt passed as separate arg
             if turn.role not in ["user", "assistant"]:
                 raise ValueError(f"Unknown role {turn.role}")
 
             content = [self._as_content_block(c) for c in turn.contents]
+
+            # Add cache control to the last content block of the last turn
+            # https://docs.claude.com/en/docs/build-with-claude/prompt-caching#how-automatic-prefix-checking-works
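+            # (everything up to and including the marked block becomes the
+            # reusable prefix for the next request)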
+            is_last_turn = i == len(turns) - 1
+            if is_last_turn and len(content) > 0:
+                if self._cache_control():
+                    content[-1]["cache_control"] = self._cache_control()
+
             role = "user" if turn.role == "user" else "assistant"
             messages.append({"role": role, "content": content})
         return messages
@@ -744,11 +812,20 @@ def batch_result_turn(self, result, has_data_model: bool = False) -> Turn | None
         message = result.result.message
         return self._as_turn(message, has_data_model)
 
+    def _cache_control(self) -> "Optional[CacheControlEphemeralParam]":
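+        """Cache control param for the configured TTL, or None when disabled."""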
+        if self._cache == "none":
+            return None
+        return {
+            "type": "ephemeral",
+            "ttl": self._cache,
+        }
+
 
 def ChatBedrockAnthropic(
     *,
     model: Optional[str] = None,
     max_tokens: int = 4096,
+    cache: Literal["5m", "1h", "none"] = "5m",
     aws_secret_key: Optional[str] = None,
     aws_access_key: Optional[str] = None,
     aws_region: Optional[str] = None,
@@ -804,6 +881,10 @@ def ChatBedrockAnthropic(
         The model to use for the chat.
     max_tokens
         Maximum number of tokens to generate before stopping.
+    cache
+        How long to cache inputs? Defaults to "5m" (five minutes).
+        Set to "none" to disable caching or "1h" to cache for one hour.
+        See the Caching section of `ChatAnthropic` for details.
     aws_secret_key
         The AWS secret key to use for authentication.
     aws_access_key
@@ -885,6 +966,7 @@ def ChatBedrockAnthropic(
         provider=AnthropicBedrockProvider(
             model=model,
             max_tokens=max_tokens,
+            cache=cache,
             aws_secret_key=aws_secret_key,
             aws_access_key=aws_access_key,
             aws_region=aws_region,
@@ -908,11 +990,17 @@ def __init__(
         aws_profile: str | None,
         aws_session_token: str | None,
         max_tokens: int = 4096,
+        cache: Literal["5m", "1h", "none"] = "5m",
         base_url: str | None,
         name: str = "AWS/Bedrock",
         kwargs: Optional["ChatBedrockClientArgs"] = None,
     ):
-        super().__init__(name=name, model=model, max_tokens=max_tokens)
+        super().__init__(
+            name=name,
+            model=model,
+            max_tokens=max_tokens,
+            cache=cache,
+        )
 
         try:
             from anthropic import AnthropicBedrock, AsyncAnthropicBedrock