Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions sentry_sdk/integrations/pydantic_ai/consts.py
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
import re

SPAN_ORIGIN = "auto.ai.pydantic_ai"

# Matches data URLs with base64-encoded content, e.g. "data:image/png;base64,iVBORw0K..."
# Group 1: MIME type (e.g. "image/png", "image/svg+xml", "video/mp4")
# Group 2: base64 data (standard or URL-safe alphabet, optional "=" padding)
#
# The type and subtype accept the RFC 6838 "restricted-name" characters
# (letters, digits and "!#$&^_.+-") rather than letters only; otherwise valid
# types such as "image/svg+xml" or "font/woff2" would not match and their
# payload would bypass redaction. Optional media-type parameters
# (e.g. ";charset=utf-8") before ";base64," are allowed per RFC 2397 and are
# deliberately not captured, so the group numbering stays unchanged.
DATA_URL_BASE64_REGEX = re.compile(
    r"^data:"
    r"([a-zA-Z0-9][a-zA-Z0-9!#$&^_.+-]*/[a-zA-Z0-9][a-zA-Z0-9!#$&^_.+-]*)"
    r"(?:;[^;,]+)*"
    r";base64,"
    r"([A-Za-z0-9+/\-_]+={0,2})$"
)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regex fails to match common MIME types, leaking data

High Severity

DATA_URL_BASE64_REGEX uses [a-zA-Z]+/[a-zA-Z]+ for the MIME type, which only matches pure alphabetic characters. Valid MIME types like video/mp4, audio/mp3, image/svg+xml, image/x-icon, and font/woff2 contain digits, hyphens, or plus signs and won't match. When a valid base64 data URL fails to match, _serialize_image_url_item falls through to the non-redacted path and exposes the full base64-encoded content in span data. An existing parse_data_uri utility in sentry_sdk/ai/utils.py already handles data URI parsing robustly per RFC 2397 and is used elsewhere in the codebase.

Additional Locations (1)
Fix in Cursor Fix in Web

12 changes: 8 additions & 4 deletions sentry_sdk/integrations/pydantic_ai/patches/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,15 @@ async def wrapped_execute_tool_call(
call = validated.call
name = call.tool_name
tool = self.tools.get(name) if self.tools else None
selected_tool_definition = getattr(tool, "tool_def", None)

# Determine tool type by checking tool.toolset
tool_type = "function"
if tool and HAS_MCP and isinstance(tool.toolset, MCPServer):
tool_type = "mcp"

tool_def = getattr(tool, "tool_def", None)
tool_description = getattr(tool_def, "description", None)

# Get agent from contextvar
agent = get_current_agent()

Expand All @@ -74,7 +76,7 @@ async def wrapped_execute_tool_call(
args_dict,
agent,
tool_type=tool_type,
tool_definition=selected_tool_definition,
tool_description=tool_description,
) as span:
try:
result = await original_execute_tool_call(
Expand Down Expand Up @@ -129,13 +131,15 @@ async def wrapped_call_tool(
# Extract tool info before calling original
name = call.tool_name
tool = self.tools.get(name) if self.tools else None
selected_tool_definition = getattr(tool, "tool_def", None)

# Determine tool type by checking tool.toolset
tool_type = "function" # default
if tool and HAS_MCP and isinstance(tool.toolset, MCPServer):
tool_type = "mcp"

tool_def = getattr(tool, "tool_def", None)
tool_description = getattr(tool_def, "description", None)

# Get agent from contextvar
agent = get_current_agent()

Expand All @@ -153,7 +157,7 @@ async def wrapped_call_tool(
args_dict,
agent,
tool_type=tool_type,
tool_definition=selected_tool_definition,
tool_description=tool_description,
) as span:
try:
result = await original_call_tool(
Expand Down
24 changes: 10 additions & 14 deletions sentry_sdk/integrations/pydantic_ai/spans/ai_client.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import json

import sentry_sdk
from sentry_sdk._types import BLOB_DATA_SUBSTITUTE
from sentry_sdk.ai.utils import (
normalize_message_roles,
set_data_normalized,
truncate_and_annotate_messages,
get_modality_from_mime_type,
)
from sentry_sdk.consts import OP, SPANDATA
from sentry_sdk.utils import safe_serialize
Expand All @@ -21,7 +19,11 @@
get_current_agent,
get_is_streaming,
)
from .utils import _set_usage_data
from .utils import (
_serialize_binary_content_item,
_serialize_image_url_item,
_set_usage_data,
)

from typing import TYPE_CHECKING

Expand All @@ -40,6 +42,7 @@
TextPart,
ThinkingPart,
BinaryContent,
ImageUrl,
)
except ImportError:
# Fallback if these classes are not available
Expand All @@ -50,6 +53,7 @@
TextPart = None
ThinkingPart = None
BinaryContent = None
ImageUrl = None


def _transform_system_instructions(
Expand Down Expand Up @@ -158,22 +162,14 @@ def _set_input_messages(span: "sentry_sdk.tracing.Span", messages: "Any") -> Non
for item in part.content:
if isinstance(item, str):
content.append({"type": "text", "text": item})
elif ImageUrl and isinstance(item, ImageUrl):
content.append(_serialize_image_url_item(item))
elif BinaryContent and isinstance(item, BinaryContent):
content.append(
{
"type": "blob",
"modality": get_modality_from_mime_type(
item.media_type
),
"mime_type": item.media_type,
"content": BLOB_DATA_SUBSTITUTE,
}
)
content.append(_serialize_binary_content_item(item))
else:
content.append(safe_serialize(item))
else:
content.append({"type": "text", "text": str(part.content)})

# Add message if we have content or tool calls
if content or tool_calls:
message: "Dict[str, Any]" = {"role": role}
Expand Down
12 changes: 4 additions & 8 deletions sentry_sdk/integrations/pydantic_ai/spans/execute_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@

if TYPE_CHECKING:
from typing import Any, Optional
from pydantic_ai._tool_manager import ToolDefinition # type: ignore


def execute_tool_span(
tool_name: str,
tool_args: "Any",
agent: "Any",
tool_type: str = "function",
tool_definition: "Optional[ToolDefinition]" = None,
tool_description: "Optional[str]" = None,
) -> "sentry_sdk.tracing.Span":
Comment on lines 14 to 20
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removal of GEN_AI_TOOL_DESCRIPTION will break existing test

The code removes the logic that sets GEN_AI_TOOL_DESCRIPTION on execute_tool spans (lines 35-40 being deleted). However, the test test_tool_description_in_execute_tool_span in tests/integrations/pydantic_ai/test_pydantic_ai.py (lines 2835-2836) explicitly asserts that this field should be present and contain the tool's docstring. Since the test file is not listed as being modified in this PR, this change will cause test failures.

Verification

Verified by reading the test file at lines 2799-2836 which shows test_tool_description_in_execute_tool_span asserting SPANDATA.GEN_AI_TOOL_DESCRIPTION in tool_span['data']. Confirmed the test file is not in the PR's list of modified files. Checked grep results showing the deleted code was the only place setting this field for pydantic_ai execute_tool spans.

Identified by Warden code-review · JDR-KXH

"""Create a span for tool execution.

Expand All @@ -26,7 +25,7 @@ def execute_tool_span(
tool_args: The arguments passed to the tool
agent: The agent executing the tool
tool_type: The type of tool ("function" for regular tools, "mcp" for MCP services)
tool_definition: The definition of the tool, if available
tool_description: Optional description of the tool
"""
span = sentry_sdk.start_span(
op=OP.GEN_AI_EXECUTE_TOOL,
Expand All @@ -38,11 +37,8 @@ def execute_tool_span(
span.set_data(SPANDATA.GEN_AI_TOOL_TYPE, tool_type)
span.set_data(SPANDATA.GEN_AI_TOOL_NAME, tool_name)

if tool_definition is not None and hasattr(tool_definition, "description"):
span.set_data(
SPANDATA.GEN_AI_TOOL_DESCRIPTION,
tool_definition.description,
)
if tool_description is not None:
span.set_data(SPANDATA.GEN_AI_TOOL_DESCRIPTION, tool_description)

_set_agent_data(span, agent)

Expand Down
24 changes: 10 additions & 14 deletions sentry_sdk/integrations/pydantic_ai/spans/invoke_agent.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import sentry_sdk
from sentry_sdk._types import BLOB_DATA_SUBSTITUTE
from sentry_sdk.ai.utils import (
get_modality_from_mime_type,
get_start_span_function,
normalize_message_roles,
set_data_normalized,
Expand All @@ -16,17 +14,22 @@
_set_model_data,
_should_send_prompts,
)
from .utils import _set_usage_data
from .utils import (
_serialize_binary_content_item,
_serialize_image_url_item,
_set_usage_data,
)

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Any

try:
from pydantic_ai.messages import BinaryContent # type: ignore
from pydantic_ai.messages import BinaryContent, ImageUrl # type: ignore
except ImportError:
BinaryContent = None
ImageUrl = None


def invoke_agent_span(
Expand Down Expand Up @@ -105,17 +108,10 @@ def invoke_agent_span(
for item in user_prompt:
if isinstance(item, str):
content.append({"text": item, "type": "text"})
elif ImageUrl and isinstance(item, ImageUrl):
content.append(_serialize_image_url_item(item))
elif BinaryContent and isinstance(item, BinaryContent):
content.append(
{
"type": "blob",
"modality": get_modality_from_mime_type(
item.media_type
),
"mime_type": item.media_type,
"content": BLOB_DATA_SUBSTITUTE,
}
)
content.append(_serialize_binary_content_item(item))
if content:
messages.append(
{
Expand Down
44 changes: 43 additions & 1 deletion sentry_sdk/integrations/pydantic_ai/spans/utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,56 @@
"""Utility functions for PydanticAI span instrumentation."""

import sentry_sdk
from sentry_sdk._types import BLOB_DATA_SUBSTITUTE
from sentry_sdk.ai.utils import get_modality_from_mime_type
from sentry_sdk.consts import SPANDATA

from ..consts import DATA_URL_BASE64_REGEX

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Union, Dict, Any, List
from typing import Union, Dict, Any, List, Optional
from pydantic_ai.usage import RequestUsage, RunUsage # type: ignore

try:
from pydantic_ai.messages import BinaryContent, ImageUrl # type: ignore
except ImportError:
BinaryContent = None
ImageUrl = None
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unused runtime imports of BinaryContent and ImageUrl

Low Severity

BinaryContent and ImageUrl are imported at runtime in spans/utils.py but never referenced anywhere in the file. The helper functions _serialize_image_url_item and _serialize_binary_content_item both accept item: "Any" and access attributes directly — the isinstance checks happen in the calling files (ai_client.py and invoke_agent.py), which have their own imports. These are dead runtime imports that add unnecessary coupling to pydantic_ai.messages.

Fix in Cursor Fix in Web



def _serialize_image_url_item(item: "Any") -> "Dict[str, Any]":
"""Serialize an ImageUrl content item for span data.

For data URLs containing base64-encoded images, the content is redacted.
For regular HTTP URLs, the URL string is preserved.
"""
data_url_matches = DATA_URL_BASE64_REGEX.match(item.url)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: The function _serialize_image_url_item will raise a TypeError because it attempts a regex match on a Pydantic Url object without first converting it to a string.
Severity: HIGH

Suggested Fix

Convert item.url to a string before performing the regex match, similar to how it is handled elsewhere in the function. Change DATA_URL_BASE64_REGEX.match(item.url) to DATA_URL_BASE64_REGEX.match(str(item.url)).

Prompt for AI Agent
Review the code at the location below. A potential bug has been identified by an AI
agent.
Verify if this is a real issue. If it is, propose a fix; if not, explain why it's not
valid.

Location: sentry_sdk/integrations/pydantic_ai/spans/utils.py#L29

Potential issue: The function `_serialize_image_url_item` calls
`DATA_URL_BASE64_REGEX.match(item.url)` without first converting `item.url` to a string.
According to Pydantic V2's documentation, URL fields are stored as
`pydantic_core._pydantic_core.Url` objects, not strings. The `re.match` function expects
a string or bytes-like object, so this operation will raise a `TypeError`. While this
error is caught in the `ai_client.py` integration, it is not handled in
`invoke_agent.py`, which will cause the instrumentation to crash when an `ImageUrl` is
processed.

Did we get this right? 👍 / 👎 to inform future reviews.


if data_url_matches:
mime_type = data_url_matches[1] or "image"
return {
"type": "image",
"mime_type": mime_type,
"content": BLOB_DATA_SUBSTITUTE,
}

return {
"type": "image",
"content": str(item.url),
Comment on lines +29 to +41
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regex fails to match valid base64 data URLs with complex MIME types

The DATA_URL_BASE64_REGEX pattern ^data:([a-zA-Z]+/[a-zA-Z]+);base64,... only matches MIME types whose type and subtype consist purely of ASCII letters. Valid MIME types like image/svg+xml or application/vnd.api+json (containing + or .), as well as types containing digits or hyphens such as video/mp4 or image/x-icon, will not match, causing _serialize_image_url_item to fall through to the non-data-URL branch and expose the full base64-encoded image content in span data instead of redacting it.

Verification

Verified by reading consts.py (line 7-9) which defines the regex as ^data:([a-zA-Z]+/[a-zA-Z]+);base64,.... The character class [a-zA-Z]+ in the MIME subtype does not include +, ., or - characters. Compared with parse_data_uri in sentry_sdk/ai/utils.py (lines 43-72) which uses string splitting and handles edge cases gracefully.

Also found at 2 additional locations
  • sentry_sdk/integrations/pydantic_ai/consts.py:7-9
  • sentry_sdk/integrations/pydantic_ai/spans/ai_client.py:165-166

Identified by Warden code-review · NKQ-KBQ

}


def _serialize_binary_content_item(item: "Any") -> "Dict[str, Any]":
    """Build the span-data representation of a binary content item.

    The raw payload is never included: the "content" field carries
    BLOB_DATA_SUBSTITUTE instead. Only the item's ``media_type`` is read; it
    is reported as "mime_type" and passed to get_modality_from_mime_type to
    fill "modality".
    """
    media_type = item.media_type
    serialized = {
        "type": "blob",
        "modality": get_modality_from_mime_type(media_type),
        "mime_type": media_type,
        "content": BLOB_DATA_SUBSTITUTE,
    }
    return serialized


def _set_usage_data(
span: "sentry_sdk.tracing.Span", usage: "Union[RequestUsage, RunUsage]"
Expand Down
Loading