From 56348a16b20152363d1e320237761ce05a5b7dcf Mon Sep 17 00:00:00 2001 From: Cozmopolit Date: Mon, 22 Dec 2025 22:04:34 +0100 Subject: [PATCH] feat(connectors): Support ImageContent in tool/function results Preserve ImageContent returned by tool/function calls so that multimodal-capable connectors (Gemini 3+) can forward it to the model. Connectors that do not support image content in tool results return a clear error message instead. Changes: - FunctionCallsProcessor: ProcessFunctionResult returns object instead of string and passes ImageContent through unchanged - Gemini: Native support via FunctionResponse.Parts with inlineData - OpenAI connector, OpenAI Assistants and Bedrock agents: Return ImageContentNotSupportedErrorMessage for ImageContent results Includes 5 new unit tests for ImageContent handling. Fixes #13430 --- .../BedrockAgentInvokeExtensions.cs | 18 ++++- .../Internal/AssistantMessageFactory.cs | 18 ++++- .../Internal/AssistantMessageFactoryTests.cs | 25 +++++++ .../Core/Gemini/GeminiRequestTests.cs | 65 +++++++++++++++++++ .../Core/Gemini/Models/GeminiPart.cs | 19 ++++++ .../Core/Gemini/Models/GeminiRequest.cs | 47 +++++++++++++- .../Core/ClientCore.ChatCompletion.cs | 11 +++- .../FunctionCalling/FunctionCallsProcessor.cs | 30 ++++++--- .../FunctionCallsProcessorTests.cs | 18 +++++ 9 files changed, 236 insertions(+), 15 deletions(-) diff --git a/dotnet/src/Agents/Bedrock/Extensions/BedrockAgentInvokeExtensions.cs b/dotnet/src/Agents/Bedrock/Extensions/BedrockAgentInvokeExtensions.cs index 5bdc214b4df7..b23c15dfcad1 100644 --- a/dotnet/src/Agents/Bedrock/Extensions/BedrockAgentInvokeExtensions.cs +++ b/dotnet/src/Agents/Bedrock/Extensions/BedrockAgentInvokeExtensions.cs @@ -214,7 +214,7 @@ private static SessionState CreateSessionStateWithFunctionResults(List { - { "TEXT", new ContentBody() { Body = FunctionCallsProcessor.ProcessFunctionResult(functionResult.Result ??
string.Empty) } } + { "TEXT", new ContentBody() { Body = GetFunctionResultAsString(functionResult.Result) } } } } }; @@ -222,4 +222,20 @@ private static SessionState CreateSessionStateWithFunctionResults(List + /// Processes a function result and returns a string representation. + /// Bedrock does not support multimodal tool results, so ImageContent returns an error message. + /// + private static string GetFunctionResultAsString(object? result) + { + var processed = FunctionCallsProcessor.ProcessFunctionResult(result ?? string.Empty); + + if (processed is ImageContent) + { + return FunctionCallsProcessor.ImageContentNotSupportedErrorMessage; + } + + return (string?)processed ?? string.Empty; + } } diff --git a/dotnet/src/Agents/OpenAI/Internal/AssistantMessageFactory.cs b/dotnet/src/Agents/OpenAI/Internal/AssistantMessageFactory.cs index 008e781fafa8..866eac8d88c5 100644 --- a/dotnet/src/Agents/OpenAI/Internal/AssistantMessageFactory.cs +++ b/dotnet/src/Agents/OpenAI/Internal/AssistantMessageFactory.cs @@ -71,8 +71,24 @@ public static IEnumerable GetMessageContents(ChatMessageContent else if (content is FunctionResultContent resultContent && resultContent.Result != null && !hasTextContent) { // Only convert a function result when text-content is not already present - yield return MessageContent.FromText(FunctionCallsProcessor.ProcessFunctionResult(resultContent.Result)); + yield return MessageContent.FromText(GetFunctionResultAsString(resultContent.Result)); } } } + + /// + /// Processes a function result and returns a string representation. + /// OpenAI Assistants do not support multimodal tool results, so ImageContent returns an error message. + /// + private static string GetFunctionResultAsString(object result) + { + var processed = FunctionCallsProcessor.ProcessFunctionResult(result); + + if (processed is ImageContent) + { + return FunctionCallsProcessor.ImageContentNotSupportedErrorMessage; + } + + return (string?)processed ?? 
string.Empty; + } } diff --git a/dotnet/src/Agents/UnitTests/OpenAI/Internal/AssistantMessageFactoryTests.cs b/dotnet/src/Agents/UnitTests/OpenAI/Internal/AssistantMessageFactoryTests.cs index 85d843465b3b..eebf29e8dffe 100644 --- a/dotnet/src/Agents/UnitTests/OpenAI/Internal/AssistantMessageFactoryTests.cs +++ b/dotnet/src/Agents/UnitTests/OpenAI/Internal/AssistantMessageFactoryTests.cs @@ -5,6 +5,7 @@ using Microsoft.SemanticKernel; using Microsoft.SemanticKernel.Agents.OpenAI.Internal; using Microsoft.SemanticKernel.ChatCompletion; + using OpenAI.Assistants; using Xunit; @@ -207,4 +208,28 @@ public void VerifyAssistantMessageAdapterGetMessageWithAll() Assert.NotNull(contents); Assert.Equal(3, contents.Length); } + + /// + /// Verify that ImageContent in FunctionResultContent returns error message + /// since OpenAI Assistants do not support multimodal tool results. + /// + [Fact] + public void VerifyAssistantMessageAdapterGetMessageWithImageContentInFunctionResult() + { + // Arrange: Create a FunctionResultContent containing ImageContent + var imageData = new ReadOnlyMemory([0x89, 0x50, 0x4E, 0x47]); // PNG magic bytes + var imageContent = new ImageContent(imageData, "image/png"); + var functionResultContent = new FunctionResultContent("TestFunction", "TestPlugin", "call-id", imageContent); + ChatMessageContent message = new(AuthorRole.Tool, items: [functionResultContent]); + + // Act + MessageContent[] contents = AssistantMessageFactory.GetMessageContents(message).ToArray(); + + // Assert: Should return error message since OpenAI Assistants don't support multimodal tool results + Assert.NotNull(contents); + Assert.Single(contents); + Assert.NotNull(contents.Single().Text); + // Expected error message from FunctionCallsProcessor.ImageContentNotSupportedErrorMessage + Assert.Equal("Error: This model does not support image content in tool results.", contents.Single().Text); + } } diff --git 
a/dotnet/src/Connectors/Connectors.Google.UnitTests/Core/Gemini/GeminiRequestTests.cs b/dotnet/src/Connectors/Connectors.Google.UnitTests/Core/Gemini/GeminiRequestTests.cs index 142f0b20e4c7..49111d0b16a2 100644 --- a/dotnet/src/Connectors/Connectors.Google.UnitTests/Core/Gemini/GeminiRequestTests.cs +++ b/dotnet/src/Connectors/Connectors.Google.UnitTests/Core/Gemini/GeminiRequestTests.cs @@ -781,6 +781,71 @@ public void FromChatHistoryMultiTurnConversationPreservesAllRoles() Assert.Equal("assistant-message-2", request.Contents[3].Parts![0].Text); } + [Fact] + public void FromChatHistoryImageContentInToolResultCreatesInlineDataPart() + { + // Arrange + ChatHistory chatHistory = []; + var imageBytes = new byte[] { 0x89, 0x50, 0x4E, 0x47 }; // PNG magic bytes + var imageContent = new ImageContent(imageBytes, "image/png"); + var kernelFunction = KernelFunctionFactory.CreateFromMethod(() => imageContent); + var toolCall = new GeminiFunctionToolCall(new GeminiPart.FunctionCallPart { FunctionName = "capture-screenshot" }); + GeminiFunctionToolResult toolCallResult = new(toolCall, new FunctionResult(kernelFunction, imageContent)); + chatHistory.Add(new GeminiChatMessageContent(AuthorRole.Tool, string.Empty, "modelId", toolCallResult)); + var executionSettings = new GeminiPromptExecutionSettings(); + + // Act + var request = GeminiRequest.FromChatHistoryAndExecutionSettings(chatHistory, executionSettings); + + // Assert + Assert.Single(request.Contents); + var part = request.Contents[0].Parts![0]; + Assert.NotNull(part.FunctionResponse); + Assert.Equal("capture-screenshot", part.FunctionResponse.FunctionName); + Assert.NotNull(part.FunctionResponse.Parts); + Assert.Single(part.FunctionResponse.Parts); + Assert.NotNull(part.FunctionResponse.Parts[0].InlineData); + Assert.Equal("image/png", part.FunctionResponse.Parts[0].InlineData!.MimeType); + Assert.Equal(Convert.ToBase64String(imageBytes), part.FunctionResponse.Parts[0].InlineData.InlineData); + } + + [Fact] + public 
void FromChatHistoryImageContentWithoutDataThrowsInvalidOperationException() + { + // Arrange + ChatHistory chatHistory = []; + var imageContent = new ImageContent(new Uri("https://example.com/image.png")) { MimeType = "image/png" }; + var kernelFunction = KernelFunctionFactory.CreateFromMethod(() => imageContent); + var toolCall = new GeminiFunctionToolCall(new GeminiPart.FunctionCallPart { FunctionName = "capture-screenshot" }); + GeminiFunctionToolResult toolCallResult = new(toolCall, new FunctionResult(kernelFunction, imageContent)); + chatHistory.Add(new GeminiChatMessageContent(AuthorRole.Tool, string.Empty, "modelId", toolCallResult)); + var executionSettings = new GeminiPromptExecutionSettings(); + + // Act & Assert + var exception = Assert.Throws( + () => GeminiRequest.FromChatHistoryAndExecutionSettings(chatHistory, executionSettings)); + Assert.Equal("ImageContent in function result must contain binary data.", exception.Message); + } + + [Fact] + public void FromChatHistoryImageContentWithoutMimeTypeThrowsInvalidOperationException() + { + // Arrange + ChatHistory chatHistory = []; + ReadOnlyMemory imageBytes = new byte[] { 0x89, 0x50, 0x4E, 0x47 }; + var imageContent = new ImageContent(imageBytes, mimeType: null); // No MimeType + var kernelFunction = KernelFunctionFactory.CreateFromMethod(() => imageContent); + var toolCall = new GeminiFunctionToolCall(new GeminiPart.FunctionCallPart { FunctionName = "capture-screenshot" }); + GeminiFunctionToolResult toolCallResult = new(toolCall, new FunctionResult(kernelFunction, imageContent)); + chatHistory.Add(new GeminiChatMessageContent(AuthorRole.Tool, string.Empty, "modelId", toolCallResult)); + var executionSettings = new GeminiPromptExecutionSettings(); + + // Act & Assert + var exception = Assert.Throws( + () => GeminiRequest.FromChatHistoryAndExecutionSettings(chatHistory, executionSettings)); + Assert.Equal("Image content MimeType is empty.", exception.Message); + } + private sealed class 
DummyContent(object? innerContent, string? modelId = null, IReadOnlyDictionary? metadata = null) : KernelContent(innerContent, modelId, metadata); diff --git a/dotnet/src/Connectors/Connectors.Google/Core/Gemini/Models/GeminiPart.cs b/dotnet/src/Connectors/Connectors.Google/Core/Gemini/Models/GeminiPart.cs index 725e6ae54fb3..a67a4f5f5fe7 100644 --- a/dotnet/src/Connectors/Connectors.Google/Core/Gemini/Models/GeminiPart.cs +++ b/dotnet/src/Connectors/Connectors.Google/Core/Gemini/Models/GeminiPart.cs @@ -172,6 +172,14 @@ internal sealed class FunctionResponsePart [JsonRequired] public FunctionResponseEntity Response { get; set; } = null!; + /// + /// Optional. Nested parts for multimodal function responses (Gemini 3+ only). + /// Contains inlineData with image/binary data as part of tool results. + /// + [JsonPropertyName("parts")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public FunctionResponsePartContent[]? Parts { get; set; } + internal sealed class FunctionResponseEntity { [JsonConstructor] @@ -189,5 +197,16 @@ public FunctionResponseEntity(object? response) [JsonRequired] public JsonNode Arguments { get; set; } = null!; } + + /// + /// Represents a part within a Gemini function response (for multimodal content). + /// Used in Gemini 3+ to include images/binary data as part of tool results. + /// + internal sealed class FunctionResponsePartContent + { + [JsonPropertyName("inlineData")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public InlineDataPart? 
InlineData { get; set; } + } } } diff --git a/dotnet/src/Connectors/Connectors.Google/Core/Gemini/Models/GeminiRequest.cs b/dotnet/src/Connectors/Connectors.Google/Core/Gemini/Models/GeminiRequest.cs index 49a24f05f041..617526304dc7 100644 --- a/dotnet/src/Connectors/Connectors.Google/Core/Gemini/Models/GeminiRequest.cs +++ b/dotnet/src/Connectors/Connectors.Google/Core/Gemini/Models/GeminiRequest.cs @@ -194,14 +194,24 @@ private static List CreateGeminiParts(ChatMessageContent content) case GeminiChatMessageContent { CalledToolResults: not null } contentWithCalledTools: // Add all function responses as separate parts in a single message parts.AddRange(contentWithCalledTools.CalledToolResults.Select(toolResult => - new GeminiPart + { + var resultValue = toolResult.FunctionResult.GetValue(); + + // Handle ImageContent for multimodal tool results (Gemini 3+ only) + if (resultValue is ImageContent imageContent) + { + return CreateImageFunctionResponsePart(toolResult.FullyQualifiedName, imageContent); + } + + return new GeminiPart { FunctionResponse = new GeminiPart.FunctionResponsePart { FunctionName = toolResult.FullyQualifiedName, - Response = new(toolResult.FunctionResult.GetValue()) + Response = new(resultValue) } - })); + }; + })); break; case GeminiChatMessageContent { ToolCalls: not null } contentWithToolCalls: parts.AddRange(contentWithToolCalls.ToolCalls.Select(toolCall => @@ -272,6 +282,37 @@ private static string GetMimeTypeFromImageContent(ImageContent imageContent) ?? throw new InvalidOperationException("Image content MimeType is empty."); } + /// + /// Creates a GeminiPart with FunctionResponse containing multimodal image data (Gemini 3+ only). 
+ /// + private static GeminiPart CreateImageFunctionResponsePart(string functionName, ImageContent imageContent) + { + if (imageContent.Data is not { IsEmpty: false }) + { + throw new InvalidOperationException("ImageContent in function result must contain binary data."); + } + + return new GeminiPart + { + FunctionResponse = new GeminiPart.FunctionResponsePart + { + FunctionName = functionName, + Response = new(new { status = "success", message = "Image data attached" }), + Parts = + [ + new GeminiPart.FunctionResponsePart.FunctionResponsePartContent + { + InlineData = new GeminiPart.InlineDataPart + { + MimeType = GetMimeTypeFromImageContent(imageContent), + InlineData = Convert.ToBase64String(imageContent.Data.Value.ToArray()) + } + } + ] + } + }; + } + private static GeminiPart CreateGeminiPartFromAudio(AudioContent audioContent) { // Binary data takes precedence over URI. diff --git a/dotnet/src/Connectors/Connectors.OpenAI/Core/ClientCore.ChatCompletion.cs b/dotnet/src/Connectors/Connectors.OpenAI/Core/ClientCore.ChatCompletion.cs index 3387601ed189..d0a05709d08f 100644 --- a/dotnet/src/Connectors/Connectors.OpenAI/Core/ClientCore.ChatCompletion.cs +++ b/dotnet/src/Connectors/Connectors.OpenAI/Core/ClientCore.ChatCompletion.cs @@ -765,9 +765,16 @@ private static List CreateRequestMessages(ChatMessageContent messag continue; } - var stringResult = FunctionCalling.FunctionCallsProcessor.ProcessFunctionResult(resultContent.Result ?? string.Empty); + var result = FunctionCalling.FunctionCallsProcessor.ProcessFunctionResult(resultContent.Result ?? string.Empty); - toolMessages.Add(new ToolChatMessage(resultContent.CallId, stringResult ?? 
string.Empty)); + // OpenAI does not support multimodal tool results - return error message for ImageContent + if (result is ImageContent) + { + toolMessages.Add(new ToolChatMessage(resultContent.CallId, FunctionCalling.FunctionCallsProcessor.ImageContentNotSupportedErrorMessage)); + continue; + } + + toolMessages.Add(new ToolChatMessage(resultContent.CallId, (string?)result ?? string.Empty)); } if (toolMessages is not null) diff --git a/dotnet/src/InternalUtilities/connectors/AI/FunctionCalling/FunctionCallsProcessor.cs b/dotnet/src/InternalUtilities/connectors/AI/FunctionCalling/FunctionCallsProcessor.cs index ddca98c7c053..1be888c9cb05 100644 --- a/dotnet/src/InternalUtilities/connectors/AI/FunctionCalling/FunctionCallsProcessor.cs +++ b/dotnet/src/InternalUtilities/connectors/AI/FunctionCalling/FunctionCallsProcessor.cs @@ -42,6 +42,11 @@ internal sealed class FunctionCallsProcessor /// private const int MaxInflightAutoInvokes = 128; + /// + /// Error message returned when a connector does not support ImageContent in tool results. + /// + public const string ImageContentNotSupportedErrorMessage = "Error: This model does not support image content in tool results."; + /// /// The maximum number of function auto-invokes that can be made in a single user request. /// @@ -340,7 +345,7 @@ private static bool TryValidateFunctionCall( return false; } - private record struct FunctionResultContext(AutoFunctionInvocationContext Context, FunctionCallContent FunctionCall, string? Result, string? ErrorMessage); + private record struct FunctionResultContext(AutoFunctionInvocationContext Context, FunctionCallContent FunctionCall, object? Result, string? ErrorMessage); private async Task ExecuteFunctionCallAsync( AutoFunctionInvocationContext invocationContext, @@ -377,8 +382,8 @@ await this.OnAutoFunctionInvocationAsync( } // Apply any changes from the auto function invocation filters context to final result. 
- string stringResult = ProcessFunctionResult(invocationContext.Result.GetValue() ?? string.Empty); - return new FunctionResultContext(invocationContext, functionCall, stringResult, null); + object result = ProcessFunctionResult(invocationContext.Result.GetValue() ?? string.Empty); + return new FunctionResultContext(invocationContext, functionCall, result, null); } /// @@ -388,7 +393,8 @@ await this.OnAutoFunctionInvocationAsync( /// The function result context. private void AddFunctionCallResultToChatHistory(ChatHistory chatHistory, FunctionResultContext resultContext) { - var message = new ChatMessageContent(role: AuthorRole.Tool, content: resultContext.Result); + // When Result is ImageContent, Content will be null - the actual result is in FunctionResultContent.Result + var message = new ChatMessageContent(role: AuthorRole.Tool, content: resultContext.Result as string); message.Items.Add(this.GenerateResultContent(resultContext)); chatHistory.Add(message); } @@ -419,9 +425,9 @@ private FunctionResultContent GenerateResultContent(FunctionResultContext result /// Creates a instance. /// /// The function call content. - /// The function result, if available + /// The function result, if available. Can be string or ImageContent. /// An error message. - private FunctionResultContent GenerateResultContent(FunctionCallContent functionCall, string? result, string? errorMessage) + private FunctionResultContent GenerateResultContent(FunctionCallContent functionCall, object? result, string? errorMessage) { // Log any error if (errorMessage is not null) @@ -429,6 +435,7 @@ private FunctionResultContent GenerateResultContent(FunctionCallContent function this._logger.LogFunctionCallRequestFailure(functionCall, errorMessage); } + // FunctionResultContent.Result is object? - pass through string or ImageContent directly return new FunctionResultContent(functionCall.FunctionName, functionCall.PluginName, functionCall.Id, result ?? errorMessage ?? 
string.Empty); } @@ -481,14 +488,21 @@ await autoFunctionInvocationFilters[index].OnAutoFunctionInvocationAsync( /// Processes the function result. /// /// The result of the function call. - /// A string representation of the function result. - public static string ProcessFunctionResult(object functionResult) + /// A string representation of the function result, or the original ImageContent for multimodal-capable connectors. + public static object ProcessFunctionResult(object functionResult) { if (functionResult is string stringResult) { return stringResult; } + // Preserve ImageContent for connectors that support multimodal tool results (e.g., Gemini 3+, Anthropic) + // Connectors that don't support this should check for ImageContent and return an appropriate error message. + if (functionResult is ImageContent) + { + return functionResult; + } + // This is an optimization to use ChatMessageContent content directly // without unnecessary serialization of the whole message content class. 
if (functionResult is ChatMessageContent chatMessageContent) diff --git a/dotnet/src/SemanticKernel.UnitTests/Utilities/AIConnectors/FunctionCallsProcessorTests.cs b/dotnet/src/SemanticKernel.UnitTests/Utilities/AIConnectors/FunctionCallsProcessorTests.cs index e1258d124c6a..1fc60b79e880 100644 --- a/dotnet/src/SemanticKernel.UnitTests/Utilities/AIConnectors/FunctionCallsProcessorTests.cs +++ b/dotnet/src/SemanticKernel.UnitTests/Utilities/AIConnectors/FunctionCallsProcessorTests.cs @@ -855,6 +855,24 @@ public void ItShouldSerializeFunctionResultsWithStringProperties() Assert.Equal("{\"Text\":\"テスト\"}", result); } + [Fact] + public void ItShouldPreserveImageContentWithoutSerialization() + { + // Arrange + var imageData = new byte[] { 0x89, 0x50, 0x4E, 0x47 }; // PNG magic bytes + var functionResult = new ImageContent(imageData, "image/png"); + + // Act + var result = FunctionCallsProcessor.ProcessFunctionResult(functionResult); + + // Assert + Assert.IsType(result); + var imageResult = (ImageContent)result; + Assert.Equal("image/png", imageResult.MimeType); + Assert.NotNull(imageResult.Data); + Assert.Equal(imageData, imageResult.Data.Value.ToArray()); + } + [Fact] public async Task ItShouldPassPromptExecutionSettingsToAutoFunctionInvocationFilterAsync() {