From 7e71e51badc9e734182deb223910e13525184666 Mon Sep 17 00:00:00 2001
From: Toygar Tanyel
Date: Wed, 8 Apr 2026 10:39:28 +0300
Subject: [PATCH] fix(langchain): handle prompt_tokens_details as dict in _parse_usage_model

When the LiteLLM proxy or OpenAI returns prompt_tokens_details as a dict
(e.g. {"cached_tokens": 12000}), _parse_usage_model only handled the
Vertex AI list format and silently dropped the dict via the
isinstance(v, int) filter on line 1318. This caused cached token counts
to be lost and input costs to be inflated in Langfuse, since
prompt_tokens was never adjusted for cache hits.

Add dict handling for prompt_tokens_details mirroring the existing
input_token_details pattern: flatten keys as input_{key} and subtract
them from the input total. Existing Vertex AI list handling is preserved
via elif.

Closes langfuse/langfuse#13024
---
 langfuse/langchain/CallbackHandler.py | 19 +++++++-
 tests/test_parse_usage_model.py       | 62 +++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 1 deletion(-)

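Note for reviewers (illustration only; this sits below the "---" cut line
and is not part of the commit message): a minimal standalone sketch of what
the new dict branch is intended to do, shown with keys as they look after
prompt_tokens/completion_tokens have already been mapped to input/output
earlier in _parse_usage_model. The helper name flatten_prompt_tokens_details
exists only for this note; the real logic lives inline in the function.

    def flatten_prompt_tokens_details(usage_model: dict) -> dict:
        """Flatten a dict-style prompt_tokens_details into input_* keys."""
        details = usage_model.pop("prompt_tokens_details", None)
        if isinstance(details, dict):
            for key, value in details.items():
                if not isinstance(value, int):
                    continue
                # Mirror the input_token_details pattern: expose the detail
                # as input_<key> and subtract it from the input total.
                usage_model[f"input_{key}"] = value
                if "input" in usage_model:
                    usage_model["input"] = max(0, usage_model["input"] - value)
        return usage_model

    print(flatten_prompt_tokens_details(
        {"input": 15000, "output": 500, "total": 15500,
         "prompt_tokens_details": {"cached_tokens": 12000}}
    ))
    # {'input': 3000, 'output': 500, 'total': 15500, 'input_cached_tokens': 12000}
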
[{"modality": "text", "token_count": N}] + elif "prompt_tokens_details" in usage_model and isinstance( usage_model["prompt_tokens_details"], list ): prompt_tokens_details = usage_model.pop("prompt_tokens_details") diff --git a/tests/test_parse_usage_model.py b/tests/test_parse_usage_model.py index df441523c..764d6132b 100644 --- a/tests/test_parse_usage_model.py +++ b/tests/test_parse_usage_model.py @@ -16,6 +16,68 @@ def test_standard_tier_input_token_details(): assert result["total"] == 14 +def test_prompt_tokens_details_dict_cached_tokens(): + """OpenAI/LiteLLM: prompt_tokens_details as dict with cached_tokens.""" + usage = { + "prompt_tokens": 15000, + "completion_tokens": 500, + "total_tokens": 15500, + "prompt_tokens_details": {"cached_tokens": 12000}, + } + result = _parse_usage_model(usage) + assert result["input"] == 3000 # 15000 - 12000 + assert result["output"] == 500 + assert result["total"] == 15500 + assert result["input_cached_tokens"] == 12000 + + +def test_prompt_tokens_details_dict_with_cache_creation(): + """OpenAI/LiteLLM: prompt_tokens_details dict + top-level cache_creation.""" + usage = { + "prompt_tokens": 15000, + "completion_tokens": 500, + "total_tokens": 15500, + "prompt_tokens_details": {"cached_tokens": 12000}, + "cache_creation_input_tokens": 3000, + } + result = _parse_usage_model(usage) + assert result["input"] == 3000 # 15000 - 12000 (cached_tokens only subtracted here) + assert result["input_cached_tokens"] == 12000 + assert result["cache_creation_input_tokens"] == 3000 + + +def test_prompt_tokens_details_list_vertex_ai(): + """Vertex AI: prompt_tokens_details as list — existing behavior preserved.""" + usage = { + "prompt_token_count": 1000, + "candidates_token_count": 200, + "total_token_count": 1200, + "prompt_tokens_details": [ + {"modality": "text", "token_count": 800}, + {"modality": "image", "token_count": 200}, + ], + } + result = _parse_usage_model(usage) + assert result["input"] == 0 # 1000 - 800 - 200 + assert result["output"] == 200 + assert result["total"] == 1200 + assert result["input_modality_text"] == 800 + assert result["input_modality_image"] == 200 + + +def test_prompt_tokens_details_dict_empty(): + """Empty dict prompt_tokens_details — no crash, input unchanged.""" + usage = { + "prompt_tokens": 5000, + "completion_tokens": 100, + "total_tokens": 5100, + "prompt_tokens_details": {}, + } + result = _parse_usage_model(usage) + assert result["input"] == 5000 + assert result["output"] == 100 + + def test_priority_tier_not_subtracted(): """Priority tier: 'priority' and 'priority_*' keys must NOT be subtracted.""" usage = {