diff --git a/src/llm/apis/openai_responses.cpp b/src/llm/apis/openai_responses.cpp
index 60ec1c4f08..32d1fddc31 100644
--- a/src/llm/apis/openai_responses.cpp
+++ b/src/llm/apis/openai_responses.cpp
@@ -57,6 +57,537 @@ static std::string joinServerSideEvents(const std::vector& events)
     return ss.str();
 }
+// Convert the Responses API tools array (flat function format) into the chat/completions
+// nested format ({type:"function", function:{name, description, parameters, ...}}) in place
+// on the request document. The chat template (e.g. gpt-oss) and the chat/completions tools
+// schema both expect the nested shape; doing this once up front lets every downstream
+// consumer (chat history path, processedJson builder for Python Jinja, parseToolsToJsonContainer)
+// share the same representation. Tools already in nested form, or non-function tools, are
+// left untouched.
+//
+// toolsArray: the request's "tools" value; anything that is not an array is ignored.
+// alloc:      allocator used for the new "function" object and the members copied into it.
+static void convertResponsesToolsInPlace(rapidjson::Value& toolsArray, rapidjson::Document::AllocatorType& alloc) {
+    if (!toolsArray.IsArray()) {
+        return;
+    }
+    for (auto& tool : toolsArray.GetArray()) {
+        if (!tool.IsObject()) {
+            continue;
+        }
+        auto toolObj = tool.GetObject();
+        if (toolObj.FindMember("function") != toolObj.MemberEnd()) {
+            continue; // Already in nested chat/completions format.
+        }
+        // A missing or non-string "type" is treated as "" and therefore skipped below.
+        auto typeIt = toolObj.FindMember("type");
+        const std::string toolType = (typeIt != toolObj.MemberEnd() && typeIt->value.IsString())
+                                         ? typeIt->value.GetString()
+                                         : "";
+        if (toolType != "function") {
+            continue; // Preserve non-function tools as-is.
+        }
+        rapidjson::Value funcObj(rapidjson::kObjectType);
+        // Single pass over the members: "type" stays on the outer object, "response" is
+        // removed without being copied, every other member is relocated under "function".
+        // NOTE(review): "response" is discarded outright - confirm that is the intended key.
+        for (auto memberIt = toolObj.MemberBegin(); memberIt != toolObj.MemberEnd();) {
+            if (!memberIt->name.IsString()) {
+                ++memberIt;
+                continue;
+            }
+            const std::string fieldName = memberIt->name.GetString();
+            if (fieldName == "type") {
+                ++memberIt;
+                continue;
+            }
+            if (fieldName == "response") {
+                memberIt = tool.EraseMember(memberIt);
+                continue;
+            }
+            rapidjson::Value keyCopy(memberIt->name, alloc);
+            rapidjson::Value valCopy(memberIt->value, alloc);
+            funcObj.AddMember(keyCopy, valCopy, alloc);
+            // The iterator is advanced by taking EraseMember's return value, not by ++.
+            memberIt = tool.EraseMember(memberIt);
+        }
+        tool.AddMember("function", funcObj, alloc);
+    }
+}
+
+// Pull the reasoning text out of a Responses API "reasoning" item.
+// Prefers the newer content[].text shape over the legacy summary[].text shape.
+// Returns the first string "text" member found (content[] is searched before
+// summary[]); returns "" when neither array provides one.
+static std::string extractReasoningText(const rapidjson::Value::ConstObject& itemObj) {
+    auto contentIt = itemObj.FindMember("content");
+    if (contentIt != itemObj.MemberEnd() && contentIt->value.IsArray()) {
+        for (const auto& ci : contentIt->value.GetArray()) {
+            if (!ci.IsObject())
+                continue;
+            auto textIt = ci.GetObject().FindMember("text");
+            if (textIt != ci.GetObject().MemberEnd() && textIt->value.IsString()) {
+                return textIt->value.GetString();
+            }
+        }
+    }
+    // Only reached when content[] is absent, not an array, or holds no string text.
+    auto summaryIt = itemObj.FindMember("summary");
+    if (summaryIt != itemObj.MemberEnd() && summaryIt->value.IsArray()) {
+        for (const auto& si : summaryIt->value.GetArray()) {
+            if (!si.IsObject())
+                continue;
+            auto textIt = si.GetObject().FindMember("text");
+            if (textIt != si.GetObject().MemberEnd() && textIt->value.IsString()) {
+                return textIt->value.GetString();
+            }
+        }
+    }
+    return "";
+}
+
+// Extract a flat text string from a Responses API content field which may be
+// either a string or an array of {type,text} objects.
+// For an array, the text of the LAST input_text/output_text item is returned, the
+// same rule ChatHistorySink::extractContent applies ("last text-bearing item
+// wins"), so the chat-history path and the processedJson path carry identical
+// message content. Items that are not objects, have no string "type", have another
+// type, or have no string "text" are ignored. Returns "" when contentVal is neither
+// a string nor an array, or when no usable text item exists.
+static std::string extractTextContent(const rapidjson::Value& contentVal) {
+    if (contentVal.IsString()) {
+        return contentVal.GetString();
+    }
+    if (!contentVal.IsArray()) {
+        return "";
+    }
+    std::string lastText;
+    for (const auto& ci : contentVal.GetArray()) {
+        if (!ci.IsObject())
+            continue;
+        const auto ciObj = ci.GetObject();
+        auto ctTypeIt = ciObj.FindMember("type");
+        if (ctTypeIt == ciObj.MemberEnd() || !ctTypeIt->value.IsString())
+            continue;
+        const std::string ctType = ctTypeIt->value.GetString();
+        if (ctType != "input_text" && ctType != "output_text")
+            continue;
+        auto textIt = ciObj.FindMember("text");
+        if (textIt != ciObj.MemberEnd() && textIt->value.IsString()) {
+            // Keep overwriting: a later text item replaces an earlier one.
+            lastText = textIt->value.GetString();
+        }
+    }
+    return lastText;
+}
+
+// Read the three string fields (id, name, arguments) out of a function_call item.
+// A field that is missing or not a string is left as an empty string.
+struct FunctionCallFields {
+    std::string id;
+    std::string name;
+    std::string arguments;
+};
+// item must be a JSON object (GetObject() is called unconditionally); the callers in
+// this file only pass items that classifyInputItem has already accepted.
+// NOTE(review): function_call_output items are matched through "call_id" (emitted as
+// tool_call_id), while tool_calls[].id is taken from "id" here - confirm whether
+// "call_id" should be preferred so both ids agree.
+static FunctionCallFields readFunctionCallFields(const rapidjson::Value& item) {
+    FunctionCallFields out;
+    auto fcObj = item.GetObject();
+    auto idIt = fcObj.FindMember("id");
+    if (idIt != fcObj.MemberEnd() && idIt->value.IsString())
+        out.id = idIt->value.GetString();
+    auto nameIt = fcObj.FindMember("name");
+    if (nameIt != fcObj.MemberEnd() && nameIt->value.IsString())
+        out.name = nameIt->value.GetString();
+    auto argsIt = fcObj.FindMember("arguments");
+    if (argsIt != fcObj.MemberEnd() && argsIt->value.IsString())
+        out.arguments = argsIt->value.GetString();
+    return out;
+}
+
+// Classification of a Responses API input item used to dispatch to per-type
+// handlers in the builders below.
+enum class ResponsesInputItemKind {
+    REASONING,             // "type" == "reasoning"
+    FUNCTION_CALL,         // "type" == "function_call"
+    FUNCTION_CALL_OUTPUT,  // "type" == "function_call_output"
+    ROLE_ITEM,             // any other (or absent) "type" with a string "role"
+    MISSING_ROLE,          // any other (or absent) "type" without a string "role"
+};
+
+// Decide which handler an input item goes to. "type" is inspected before "role",
+// so a reasoning/function_call/function_call_output item is classified by its type
+// even if it also carries a role. Returns InvalidArgument when item is not an object.
+static absl::StatusOr classifyInputItem(const rapidjson::Value& item) {
+    if (!item.IsObject()) {
+        return absl::InvalidArgumentError("input array items must be objects");
+    }
+    auto itemObj = item.GetObject();
+    auto itemTypeIt = itemObj.FindMember("type");
+    // A missing or non-string "type" is treated as "" and falls through to the role check.
+    const std::string itemType = (itemTypeIt != itemObj.MemberEnd() && itemTypeIt->value.IsString())
+                                     ? itemTypeIt->value.GetString()
+                                     : "";
+    if (itemType == "reasoning")
+        return ResponsesInputItemKind::REASONING;
+    if (itemType == "function_call")
+        return ResponsesInputItemKind::FUNCTION_CALL;
+    if (itemType == "function_call_output")
+        return ResponsesInputItemKind::FUNCTION_CALL_OUTPUT;
+    auto roleIt = itemObj.FindMember("role");
+    if (roleIt == itemObj.MemberEnd() || !roleIt->value.IsString())
+        return ResponsesInputItemKind::MISSING_ROLE;
+    return ResponsesInputItemKind::ROLE_ITEM;
+}
+
+// Builds chat/completions-shaped messages from a Responses API input array.
+//
+// Reasoning items are buffered and attached as `reasoning_content` on the next
+// assistant message (matching the gpt-oss template's expected field).
+// Several reasoning items in a row are joined with "\n". Buffered reasoning is
+// never dropped: when no assistant message or function_call follows it, it is
+// flushed as its own assistant turn (see flushPendingFunctionCalls).
+//
+// Pending function_call items are merged into the next assistant message as a
+// chat/completions-shaped tool_calls[] array. Without this, the assistant turn
+// would have no tool_calls field, the chat template would treat it as a final
+// answer, and a subsequent tool message would fail (e.g. gpt-oss raises
+// "Message has tool role, but there was no previous assistant message with a
+// tool call!").
+//
+// Reasoning that is not followed by an assistant or function_call item is
+// emitted as a standalone assistant turn with no `content` field and the buffered
+// reasoning attached as `reasoning_content`. This preserves the model's
+// chain-of-thought across turns even when the prior turn produced no visible
+// output.
+//
+// The algorithm is sink-agnostic; concrete output (ov::genai::ChatHistory vs a
+// rapidjson messages array) is provided by the Sink template parameter, which
+// must implement:
+//   absl::Status extractContent(itemObj, index, std::string& outText);
+//       // index = position the item's message will occupy among the messages
+//       // emitted by this builder (NOT the item's position in the input array)
+//   void emitToolMessage(callId, output);
+//   void emitMessage(role, contentText, reasoning);  // reasoning empty -> skip
+//   void emitAssistantWithToolCalls(contentText, reasoning, toolCalls);
+//   void emitStandaloneReasoning(reasoning);  // assistant turn carrying only reasoning_content
+//   absl::Status onMissingRole(itemObj);
+// Each emit* call is expected to append exactly one message; the builder counts
+// them to compute the index handed to extractContent.
+template <typename Sink>
+class ResponsesInputBuilder {
+public:
+    explicit ResponsesInputBuilder(Sink& sink) :
+        sink(sink) {}
+
+    // Walks inputArray in order and forwards the resulting messages to the sink.
+    // Returns InvalidArgument when inputArray is not an array or one of its items
+    // is not an object, and propagates any non-OK status returned by the sink.
+    absl::Status build(const rapidjson::Value& inputArray) {
+        if (!inputArray.IsArray()) {
+            return absl::InvalidArgumentError("input is not an array");
+        }
+        for (const auto& item : inputArray.GetArray()) {
+            auto kind = classifyInputItem(item);
+            if (!kind.ok())
+                return kind.status();
+            absl::Status status;
+            switch (kind.value()) {
+            case ResponsesInputItemKind::REASONING:
+                status = onReasoningItem(item.GetObject());
+                break;
+            case ResponsesInputItemKind::FUNCTION_CALL:
+                // Only the address is kept; the item lives in inputArray, which
+                // outlives this call.
+                pendingFunctionCalls.push_back(&item);
+                break;
+            case ResponsesInputItemKind::FUNCTION_CALL_OUTPUT:
+                status = onFunctionCallOutputItem(item.GetObject());
+                break;
+            case ResponsesInputItemKind::ROLE_ITEM:
+                status = onRoleItem(item.GetObject());
+                break;
+            case ResponsesInputItemKind::MISSING_ROLE:
+                status = sink.onMissingRole(item.GetObject());
+                break;
+            }
+            if (!status.ok())
+                return status;
+        }
+        // Flush any trailing buffered function_calls (e.g. input ends with a
+        // function_call item that has no corresponding output yet) and any
+        // trailing buffered reasoning.
+        flushPendingFunctionCalls("");
+        return absl::OkStatus();
+    }
+
+private:
+    // Appends the item's reasoning text to the buffer, newline-separated.
+    absl::Status onReasoningItem(const rapidjson::Value::ConstObject& itemObj) {
+        std::string text = extractReasoningText(itemObj);
+        if (!text.empty()) {
+            if (!pendingReasoningContent.empty())
+                pendingReasoningContent += "\n";
+            pendingReasoningContent += text;
+        }
+        return absl::OkStatus();
+    }
+
+    // Emits the buffered assistant tool_calls turn (if any) followed by a tool
+    // message. Missing or non-string call_id/output become empty strings.
+    absl::Status onFunctionCallOutputItem(const rapidjson::Value::ConstObject& itemObj) {
+        flushPendingFunctionCalls("");
+        std::string callId;
+        auto callIdIt = itemObj.FindMember("call_id");
+        if (callIdIt != itemObj.MemberEnd() && callIdIt->value.IsString())
+            callId = callIdIt->value.GetString();
+        std::string output;
+        auto outputIt = itemObj.FindMember("output");
+        if (outputIt != itemObj.MemberEnd() && outputIt->value.IsString())
+            output = outputIt->value.GetString();
+        sink.emitToolMessage(callId, output);
+        ++emittedMessages;
+        return absl::OkStatus();
+    }
+
+    // Handles an item that carries a string "role" (guaranteed by classifyInputItem).
+    absl::Status onRoleItem(const rapidjson::Value::ConstObject& itemObj) {
+        const std::string role = itemObj.FindMember("role")->value.GetString();
+        const bool isAssistant = (role == "assistant");
+
+        // Non-assistant items must not absorb pending tool_calls or reasoning.
+        // Flush BEFORE extracting content so that emittedMessages already equals
+        // the position this item's own message will take; the sink uses that
+        // position to tag images. (The input-array index cannot be used: reasoning
+        // and function_call items do not map 1:1 to emitted messages.)
+        // Assumes the sink's message list was empty when build() started, as the
+        // pre-refactor one-message-per-item loop did - TODO confirm.
+        if (!isAssistant) {
+            flushPendingFunctionCalls("");
+        }
+
+        std::string contentText;
+        auto status = sink.extractContent(itemObj, emittedMessages, contentText);
+        if (!status.ok())
+            return status;
+
+        // Assistant role with buffered function_calls: merge into one message
+        // (so the tool_calls field rides on the same assistant turn).
+        if (isAssistant && !pendingFunctionCalls.empty()) {
+            flushPendingFunctionCalls(contentText);
+            return absl::OkStatus();
+        }
+
+        std::string reasoning;
+        if (isAssistant) {
+            reasoning = std::move(pendingReasoningContent);
+            pendingReasoningContent.clear();
+        }
+        sink.emitMessage(role, contentText, reasoning);
+        ++emittedMessages;
+        return absl::OkStatus();
+    }
+
+    // Emits whatever is buffered as one assistant turn:
+    //  - with pending function_calls: an assistant message carrying assistantText,
+    //    the buffered reasoning and the tool_calls;
+    //  - without them but with buffered reasoning: a standalone assistant turn
+    //    carrying only reasoning_content (no `content` field at all, so templates
+    //    that gate on `message.content` skip the content branch and templates that
+    //    gate on `message.reasoning_content` still see the buffered text);
+    //  - with nothing buffered: no message.
+    void flushPendingFunctionCalls(const std::string& assistantText) {
+        std::string reasoning = std::move(pendingReasoningContent);
+        pendingReasoningContent.clear();
+        if (pendingFunctionCalls.empty()) {
+            if (!reasoning.empty()) {
+                sink.emitStandaloneReasoning(reasoning);
+                ++emittedMessages;
+            }
+            return;
+        }
+        sink.emitAssistantWithToolCalls(assistantText, reasoning, pendingFunctionCalls);
+        ++emittedMessages;
+        pendingFunctionCalls.clear();
+    }
+
+    Sink& sink;
+    std::vector<const rapidjson::Value*> pendingFunctionCalls;
+    std::string pendingReasoningContent;
+    // Number of messages handed to the sink so far.
+    rapidjson::SizeType emittedMessages = 0;
+};
+
+// Sink that appends to ov::genai::ChatHistory (used when Python is disabled
+// or as the fallback C++ chat-history path). Owns a scratch rapidjson document
+// whose allocator backs the tool_calls Values until they are deep-copied into
+// a JsonContainer.
+class ChatHistorySink {
+public:
+    // All four arguments are stored as references, so they must outlive the sink.
+    ChatHistorySink(ov::genai::ChatHistory& chatHistory, ImageHistory& imageHistory,
+        const std::optional& allowedLocalMediaPath,
+        const std::optional>& allowedMediaDomains) :
+        chatHistory(chatHistory),
+        imageHistory(imageHistory),
+        allowedLocalMediaPath(allowedLocalMediaPath),
+        allowedMediaDomains(allowedMediaDomains) {
+        scratchDoc.SetObject();
+    }
+
+    // Collects the item's text into outText and loads every input_image part into
+    // imageHistory, tagged with `index`. A missing "content" member yields empty
+    // text and OK. Returns InvalidArgument for content that is neither a string nor
+    // an array, for malformed content items, and for image parts that fail to load.
+    absl::Status extractContent(const rapidjson::Value::ConstObject& itemObj,
+        rapidjson::SizeType index, std::string& outText) {
+        outText.clear();
+        auto contentIt = itemObj.FindMember("content");
+        if (contentIt == itemObj.MemberEnd())
+            return absl::OkStatus();
+        if (contentIt->value.IsString()) {
+            outText = contentIt->value.GetString();
+            return absl::OkStatus();
+        }
+        if (!contentIt->value.IsArray())
+            return absl::InvalidArgumentError("input item content must be a string or array");
+        for (const auto& contentItem : contentIt->value.GetArray()) {
+            if (!contentItem.IsObject())
+                return absl::InvalidArgumentError("input content items must be objects");
+            auto contentObj = contentItem.GetObject();
+            auto typeIt = contentObj.FindMember("type");
+            if (typeIt == contentObj.MemberEnd() || !typeIt->value.IsString())
+                return absl::InvalidArgumentError("input content item type is missing or invalid");
+            const std::string type = typeIt->value.GetString();
+            if (type == "input_text" || type == "output_text") {
+                auto textIt = contentObj.FindMember("text");
+                if (textIt == contentObj.MemberEnd() || !textIt->value.IsString())
+                    return absl::InvalidArgumentError(absl::StrCat(type, " requires a valid text field"));
+                // Last text-bearing item wins, matching pre-refactor behaviour.
+                outText = textIt->value.GetString();
+            } else if (type == "input_image") {
+                auto status = appendInputImage(contentObj, index);
+                if (!status.ok())
+                    return status;
+            } else {
+                // Skip unrecognised content item types for forward compatibility.
+                SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Skipping unsupported content type: {}", type);
+            }
+        }
+        return absl::OkStatus();
+    }
+
+    // Appends {role:"tool", content:output}; tool_call_id is only set when callId
+    // is non-empty.
+    void emitToolMessage(const std::string& callId, const std::string& output) {
+        chatHistory.push_back({});
+        chatHistory.last()["role"] = "tool";
+        if (!callId.empty())
+            chatHistory.last()["tool_call_id"] = callId;
+        chatHistory.last()["content"] = output;
+    }
+
+    // Appends {role, content}; reasoning_content is only set when reasoning is
+    // non-empty.
+    void emitMessage(const std::string& role, const std::string& contentText, const std::string& reasoning) {
+        chatHistory.push_back({});
+        chatHistory.last()["role"] = role;
+        chatHistory.last()["content"] = contentText;
+        if (!reasoning.empty())
+            chatHistory.last()["reasoning_content"] = reasoning;
+    }
+
+    // Appends one assistant message carrying contentText (possibly empty), optional
+    // reasoning_content, and a tool_calls array built from toolCalls.
+    void emitAssistantWithToolCalls(const std::string& contentText, const std::string& reasoning,
+        const std::vector& toolCalls) {
+        chatHistory.push_back({});
+        chatHistory.last()["role"] = "assistant";
+        chatHistory.last()["content"] = contentText;
+        if (!reasoning.empty())
+            chatHistory.last()["reasoning_content"] = reasoning;
+        auto& alloc = scratchDoc.GetAllocator();
+        rapidjson::Value toolCallsArray(rapidjson::kArrayType);
+        buildToolCallsArray(toolCalls, toolCallsArray, alloc);
+        // rapidJsonValueToJsonContainer deep-copies, so scratchDoc can be reused.
+        chatHistory.last()["tool_calls"] = rapidJsonValueToJsonContainer(toolCallsArray);
+    }
+
+    // Emit an assistant turn that carries only reasoning_content (no content,
+    // no tool_calls). Used when reasoning is not followed by an assistant or
+    // function_call item.
+    void emitStandaloneReasoning(const std::string& reasoning) {
+        chatHistory.push_back({});
+        chatHistory.last()["role"] = "assistant";
+        chatHistory.last()["reasoning_content"] = reasoning;
+    }
+
+    // Items without a usable role are a hard error on this path.
+    absl::Status onMissingRole(const rapidjson::Value::ConstObject&) {
+        return absl::InvalidArgumentError("input item role is missing or invalid");
+    }
+
+private:
+    // Resolves the image URL (image_url may be a string or an object with a string
+    // "url"), loads it through loadImage and appends {index, tensor} to imageHistory.
+    // `index` is stored verbatim. Returns InvalidArgument for a missing/ill-typed
+    // image_url and forwards loadImage's error status.
+    absl::Status appendInputImage(const rapidjson::Value::ConstObject& contentObj, rapidjson::SizeType index) {
+        auto imageUrlIt = contentObj.FindMember("image_url");
+        if (imageUrlIt == contentObj.MemberEnd())
+            return absl::InvalidArgumentError("input_image requires image_url field");
+
+        std::string imageUrl;
+        if (imageUrlIt->value.IsString()) {
+            imageUrl = imageUrlIt->value.GetString();
+        } else if (imageUrlIt->value.IsObject()) {
+            auto imageUrlObj = imageUrlIt->value.GetObject();
+            auto urlIt = imageUrlObj.FindMember("url");
+            if (urlIt == imageUrlObj.MemberEnd() || !urlIt->value.IsString())
+                return absl::InvalidArgumentError("input_image.image_url.url is missing or invalid");
+            imageUrl = urlIt->value.GetString();
+        } else {
+            return absl::InvalidArgumentError("input_image.image_url must be a string or object");
+        }
+
+        auto tensorResult = loadImage(imageUrl, allowedLocalMediaPath, allowedMediaDomains);
+        if (!tensorResult.ok())
+            return tensorResult.status();
+        imageHistory.push_back({index, tensorResult.value()});
+        return absl::OkStatus();
+    }
+
+    // Build a chat/completions tool_calls[] array into outArr using the given allocator.
+    // Each entry has the shape {id, type:"function", function:{name, arguments}};
+    // fields absent from the function_call item become empty strings.
+    static void buildToolCallsArray(const std::vector& toolCalls,
+        rapidjson::Value& outArr, rapidjson::Document::AllocatorType& alloc) {
+        for (const auto* fc : toolCalls) {
+            const FunctionCallFields fields = readFunctionCallFields(*fc);
+            rapidjson::Value funcObj(rapidjson::kObjectType);
+            funcObj.AddMember("name", rapidjson::Value(fields.name.c_str(), alloc), alloc);
+            funcObj.AddMember("arguments", rapidjson::Value(fields.arguments.c_str(), alloc), alloc);
+            rapidjson::Value tcObj(rapidjson::kObjectType);
+            tcObj.AddMember("id", rapidjson::Value(fields.id.c_str(), alloc), alloc);
+            tcObj.AddMember("type", rapidjson::Value("function", alloc), alloc);
+            tcObj.AddMember("function", funcObj, alloc);
+            outArr.PushBack(tcObj, alloc);
+        }
+    }
+
+    ov::genai::ChatHistory& chatHistory;
+    ImageHistory& imageHistory;
+    const std::optional& allowedLocalMediaPath;
+    const std::optional>& allowedMediaDomains;
+    // Backing storage for temporary tool_calls values; see emitAssistantWithToolCalls.
+    rapidjson::Document scratchDoc;
+};
+
+#if (PYTHON_DISABLE == 0)
+// Sink that appends to a rapidjson messages array, used to feed the Python
+// Jinja chat template path. Image content items are silently dropped (the
+// Python path receives only text).
+class ProcessedJsonSink {
+public:
+    // messagesArray receives the messages; alloc is used for every value added to it.
+    // Both are stored as references and must outlive the sink.
+    ProcessedJsonSink(rapidjson::Value& messagesArray, rapidjson::Document::AllocatorType& alloc) :
+        messagesArray(messagesArray),
+        alloc(alloc) {}
+
+    // Never fails: a missing "content" member, or content that extractTextContent
+    // cannot read, yields an empty string instead of an error (unlike
+    // ChatHistorySink::extractContent, which validates the content items).
+    absl::Status extractContent(const rapidjson::Value::ConstObject& itemObj,
+        rapidjson::SizeType /*index*/, std::string& outText) {
+        auto contentIt = itemObj.FindMember("content");
+        outText = (contentIt != itemObj.MemberEnd()) ? extractTextContent(contentIt->value) : "";
+        return absl::OkStatus();
+    }
+
+    // Appends {role:"tool", content:output}; tool_call_id is only added when callId
+    // is non-empty.
+    void emitToolMessage(const std::string& callId, const std::string& output) {
+        rapidjson::Value msgObj(rapidjson::kObjectType);
+        msgObj.AddMember("role", rapidjson::Value("tool", alloc), alloc);
+        if (!callId.empty())
+            msgObj.AddMember("tool_call_id", rapidjson::Value(callId.c_str(), alloc), alloc);
+        msgObj.AddMember("content", rapidjson::Value(output.c_str(), alloc), alloc);
+        messagesArray.PushBack(msgObj, alloc);
+    }
+
+    // Appends {role, content}; reasoning_content is only added when reasoning is
+    // non-empty.
+    void emitMessage(const std::string& role, const std::string& contentText, const std::string& reasoning) {
+        rapidjson::Value msgObj(rapidjson::kObjectType);
+        msgObj.AddMember("role", rapidjson::Value(role.c_str(), alloc), alloc);
+        msgObj.AddMember("content", rapidjson::Value(contentText.c_str(), alloc), alloc);
+        if (!reasoning.empty())
+            msgObj.AddMember("reasoning_content", rapidjson::Value(reasoning.c_str(), alloc), alloc);
+        messagesArray.PushBack(msgObj, alloc);
+    }
+
+    // Emit an assistant turn that carries only reasoning_content (no content,
+    // no tool_calls). See ChatHistorySink::emitStandaloneReasoning for rationale.
+    void emitStandaloneReasoning(const std::string& reasoning) {
+        rapidjson::Value msgObj(rapidjson::kObjectType);
+        msgObj.AddMember("role", rapidjson::Value("assistant", alloc), alloc);
+        msgObj.AddMember("reasoning_content", rapidjson::Value(reasoning.c_str(), alloc), alloc);
+        messagesArray.PushBack(msgObj, alloc);
+    }
+
+    // Appends one assistant message carrying contentText (possibly empty), optional
+    // reasoning_content, and tool_calls entries of the shape
+    // {id, type:"function", function:{name, arguments}} - the same shape
+    // ChatHistorySink::buildToolCallsArray produces.
+    void emitAssistantWithToolCalls(const std::string& contentText, const std::string& reasoning,
+        const std::vector& toolCalls) {
+        rapidjson::Value msgObj(rapidjson::kObjectType);
+        msgObj.AddMember("role", rapidjson::Value("assistant", alloc), alloc);
+        msgObj.AddMember("content", rapidjson::Value(contentText.c_str(), alloc), alloc);
+        if (!reasoning.empty())
+            msgObj.AddMember("reasoning_content", rapidjson::Value(reasoning.c_str(), alloc), alloc);
+        rapidjson::Value toolCallsArray(rapidjson::kArrayType);
+        for (const auto* fc : toolCalls) {
+            const FunctionCallFields fields = readFunctionCallFields(*fc);
+            rapidjson::Value funcObj(rapidjson::kObjectType);
+            funcObj.AddMember("name", rapidjson::Value(fields.name.c_str(), alloc), alloc);
+            funcObj.AddMember("arguments", rapidjson::Value(fields.arguments.c_str(), alloc), alloc);
+            rapidjson::Value tcObj(rapidjson::kObjectType);
+            tcObj.AddMember("id", rapidjson::Value(fields.id.c_str(), alloc), alloc);
+            tcObj.AddMember("type", rapidjson::Value("function", alloc), alloc);
+            tcObj.AddMember("function", funcObj, alloc);
+            toolCallsArray.PushBack(tcObj, alloc);
+        }
+        msgObj.AddMember("tool_calls", toolCallsArray, alloc);
+        messagesArray.PushBack(msgObj, alloc);
+    }
+
+    absl::Status onMissingRole(const rapidjson::Value::ConstObject&) {
+        // Silently skip unknown items without a role in the processed JSON path.
+        return absl::OkStatus();
+    }
+
+private:
+    rapidjson::Value& messagesArray;
+    rapidjson::Document::AllocatorType& alloc;
+};
+#endif // PYTHON_DISABLE == 0
+
 // --- Request parsing ---
 absl::Status OpenAIResponsesHandler::parseRequest(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength,
@@ -87,87 +618,12 @@ absl::Status OpenAIResponsesHandler::parseInput(std::optional allow
         if (inputIt->value.GetArray().Size() == 0) {
             return absl::InvalidArgumentError("Messages array cannot be empty");
         }
-
-        for (size_t i = 0; i < inputIt->value.GetArray().Size(); ++i) {
-            auto& item = inputIt->value.GetArray()[i];
-            if (!item.IsObject()) {
-                return absl::InvalidArgumentError("input array items must be objects");
-            }
-
-            auto itemObj = item.GetObject();
-            auto roleIt = itemObj.FindMember("role");
-            if (roleIt == itemObj.MemberEnd() || !roleIt->value.IsString()) {
-                return absl::InvalidArgumentError("input item role is missing or invalid");
-            }
-
-            request.chatHistory.push_back({});
-            request.chatHistory.last()["role"] = roleIt->value.GetString();
-
-            auto contentIt = itemObj.FindMember("content");
-            if (contentIt == itemObj.MemberEnd()) {
-                return absl::InvalidArgumentError("input item content is missing");
-            }
-
-            if (contentIt->value.IsString()) {
-                request.chatHistory.last()["content"] = contentIt->value.GetString();
-                continue;
-            }
-
-            if (!contentIt->value.IsArray()) {
-                return absl::InvalidArgumentError("input item content must be a string or array");
-            }
-            if (contentIt->value.GetArray().Size() == 0) {
-                return absl::InvalidArgumentError("Invalid message structure - content array is empty");
-            }
-
-            std::string contentText = "";
-            for (auto& contentItem : contentIt->value.GetArray()) {
-                if (!contentItem.IsObject()) {
-                    return absl::InvalidArgumentError("input content items must be objects");
-                }
-                auto contentObj = contentItem.GetObject();
-                auto typeIt = contentObj.FindMember("type");
-                if (typeIt == contentObj.MemberEnd() || 
!typeIt->value.IsString()) { - return absl::InvalidArgumentError("input content item type is missing or invalid"); - } - - const std::string type = typeIt->value.GetString(); - if (type == "input_text") { - auto textIt = contentObj.FindMember("text"); - if (textIt == contentObj.MemberEnd() || !textIt->value.IsString()) { - return absl::InvalidArgumentError("input_text requires a valid text field"); - } - contentText = textIt->value.GetString(); - } else if (type == "input_image") { - std::string imageUrl; - auto imageUrlIt = contentObj.FindMember("image_url"); - if (imageUrlIt == contentObj.MemberEnd()) { - return absl::InvalidArgumentError("input_image requires image_url field"); - } - if (imageUrlIt->value.IsString()) { - imageUrl = imageUrlIt->value.GetString(); - } else if (imageUrlIt->value.IsObject()) { - auto imageUrlObj = imageUrlIt->value.GetObject(); - auto urlIt = imageUrlObj.FindMember("url"); - if (urlIt == imageUrlObj.MemberEnd() || !urlIt->value.IsString()) { - return absl::InvalidArgumentError("input_image.image_url.url is missing or invalid"); - } - imageUrl = urlIt->value.GetString(); - } else { - return absl::InvalidArgumentError("input_image.image_url must be a string or object"); - } - - auto tensorResult = loadImage(imageUrl, allowedLocalMediaPath, allowedMediaDomains); - if (!tensorResult.ok()) { - return tensorResult.status(); - } - request.imageHistory.push_back({i, tensorResult.value()}); - } else { - return absl::InvalidArgumentError("Unsupported content type. 
Supported types are input_text and input_image."); - } - } - - request.chatHistory.last()["content"] = contentText; + ChatHistorySink sink(request.chatHistory, request.imageHistory, + allowedLocalMediaPath, allowedMediaDomains); + ResponsesInputBuilder builder(sink); + auto status = builder.build(inputIt->value); + if (!status.ok()) { + return status; } } else { return absl::InvalidArgumentError("input is not a string or array"); @@ -189,6 +645,14 @@ absl::Status OpenAIResponsesHandler::parseResponsesPart(std::optional return absl::InvalidArgumentError("input missing in request"); } + // Convert tools array (Responses-flat -> chat/completions-nested) once, in place, + // before any consumer reads it. parseInput, parseToolsToJsonContainer and the + // processedJson builder all rely on the nested shape. + auto toolsIt = doc.FindMember("tools"); + if (toolsIt != doc.MemberEnd() && toolsIt->value.IsArray()) { + convertResponsesToolsInPlace(toolsIt->value, doc.GetAllocator()); + } + auto messagesStatus = parseInput(allowedLocalMediaPath, allowedMediaDomains); if (!messagesStatus.ok()) { return messagesStatus; @@ -228,33 +692,41 @@ absl::Status OpenAIResponsesHandler::parseResponsesPart(std::optional } #if (PYTHON_DISABLE == 0) - // Build processedJson with "messages" array from chatHistory so that - // the Python chat template path (which reads request_json["messages"]) - // can consume Responses API input without a separate code path. + // Build processedJson with a "messages" array in chat/completions format so that + // the Python Jinja template path can consume Responses API input without a separate code path. + // Handles reasoning, function_call (merged into assistant tool_calls), and + // function_call_output (converted to role:tool messages). 
{ Document processedDoc; processedDoc.SetObject(); auto& alloc = processedDoc.GetAllocator(); Value messagesArray(kArrayType); - for (size_t i = 0; i < request.chatHistory.size(); ++i) { - Value msgObj(kObjectType); - auto role = request.chatHistory[i]["role"].as_string(); - if (role.has_value()) { - msgObj.AddMember("role", Value(role.value().c_str(), alloc), alloc); - } - auto content = request.chatHistory[i]["content"].as_string(); - if (content.has_value()) { - msgObj.AddMember("content", Value(content.value().c_str(), alloc), alloc); + + auto inputArrIt = doc.FindMember("input"); + if (inputArrIt != doc.MemberEnd() && inputArrIt->value.IsArray()) { + ProcessedJsonSink sink(messagesArray, alloc); + ResponsesInputBuilder builder(sink); + auto processedStatus = builder.build(inputArrIt->value); + if (!processedStatus.ok()) { + return processedStatus; } + } else if (inputArrIt != doc.MemberEnd() && inputArrIt->value.IsString()) { + // String input: emit a single user message so the Python Jinja path + // sees the same content the C++ chatHistory path does. + Value msgObj(kObjectType); + msgObj.AddMember("role", Value("user", alloc), alloc); + msgObj.AddMember("content", Value(inputArrIt->value.GetString(), alloc), alloc); messagesArray.PushBack(msgObj, alloc); } + processedDoc.AddMember("messages", messagesArray, alloc); - // Copy tools from original doc if present - auto toolsIt = doc.FindMember("tools"); - if (toolsIt != doc.MemberEnd() && !toolsIt->value.IsNull()) { - Value toolsCopy(toolsIt->value, alloc); + // Tools were already normalised to chat/completions nested format by + // convertResponsesToolsInPlace earlier in parseResponsesPart — just copy verbatim. 
+ auto processedToolsIt = doc.FindMember("tools"); + if (processedToolsIt != doc.MemberEnd() && !processedToolsIt->value.IsNull()) { + Value toolsCopy(processedToolsIt->value, alloc); processedDoc.AddMember("tools", toolsCopy, alloc); } diff --git a/src/llm/py_jinja_template_processor.cpp b/src/llm/py_jinja_template_processor.cpp index 432aa8e722..188a3c0daa 100644 --- a/src/llm/py_jinja_template_processor.cpp +++ b/src/llm/py_jinja_template_processor.cpp @@ -40,7 +40,6 @@ bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templ output = "Error: Chat template not loaded correctly, so it cannot be applied"; return false; } - py::gil_scoped_acquire acquire; try { auto locals = py::dict("request_body"_a = requestBody, "chat_template"_a = templateProcessor.chatTemplate->getObject(), diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 5a0955b4f5..d35db8d3b2 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -22,6 +22,7 @@ #pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 4005 4456 6246 6313) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#include "absl/strings/str_cat.h" #include "mediapipe/framework/calculator_graph.h" #include #include @@ -209,7 +210,7 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrtokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); } catch (const std::exception& e) { SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); - return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. 
The model either does not have chat template or has an invalid one."); + return absl::Status(absl::StatusCode::kInvalidArgument, absl::StrCat("Failed to apply chat template: ", e.what())); } #endif if (inputText.size() == 0) { diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 0ef06d22df..defa1af281 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -105,6 +105,12 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrto_json_string() : std::string("")); + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatTemplateKwargs: {}", chatTemplateKwargs.has_value() ? chatTemplateKwargs->to_json_string() : std::string("")); + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM addGenerationPrompt: {}", addGenerationPrompt); vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); } else { return absl::InvalidArgumentError("Unsupported endpoint"); diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index c3a40cba3c..2a0ad01005 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -3834,3 +3834,546 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParseMessagesRegularMessageHasNoToolFields) EXPECT_FALSE(history[1].contains("tool_call_id")); EXPECT_FALSE(history[1].contains("name")); } + +namespace { +std::shared_ptr parseResponses(rapidjson::Document& doc, ov::genai::Tokenizer& tokenizer, const std::string& json) { + doc.Parse(json.c_str()); + EXPECT_FALSE(doc.HasParseError()) << json; + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + auto apiHandler = std::make_shared( + doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), tokenizer); + 
EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()) << json; + return apiHandler; +} + +// Variant for negative tests: returns the parseRequest status without asserting +// it is OK, so the caller can verify the failure mode. +absl::Status tryParseResponses(rapidjson::Document& doc, ov::genai::Tokenizer& tokenizer, const std::string& json) { + doc.Parse(json.c_str()); + EXPECT_FALSE(doc.HasParseError()) << json; + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + auto apiHandler = std::make_shared( + doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), tokenizer); + return apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength); +} + +// Assert that parsing the given Responses API request produces a chat history +// (and processedJson, when Python is enabled) equivalent to the expected +// chat/completions request. +// +// The expected JSON is a chat/completions REQUEST body — an object with a +// "messages" array and optionally a "tools" array. This makes each test read as +// "given this Responses input, OVMS should produce this chat/completions +// request" — which is exactly the contract of the Responses-to-chat/completions +// translator. +// +// Comparison is structural via rapidjson Value::operator== (member order inside +// objects is irrelevant). +// +// Both the chat-history path (used in the C++/non-Python build) and the +// processedJson path (used by the Python Jinja template) are checked, so a +// single test pins both downstream consumers. 
+void expectResponsesEquivalentToChatCompletions(rapidjson::Document& doc, ov::genai::Tokenizer& tokenizer, + const std::string& responsesRequest, const std::string& expectedChatCompletions) { + auto handler = parseResponses(doc, tokenizer, responsesRequest); + + rapidjson::Document expectedDoc; + expectedDoc.Parse(expectedChatCompletions.c_str()); + ASSERT_FALSE(expectedDoc.HasParseError()) + << "could not parse expected chat/completions: " << expectedChatCompletions; + ASSERT_TRUE(expectedDoc.HasMember("messages")) + << "expected chat/completions JSON must contain a 'messages' array"; + + // --- ChatHistory path (C++ / non-Python build) --- + const std::string actualHistoryJson = handler->getChatHistory().get_messages().to_json_string(); + rapidjson::Document actualHistoryDoc; + actualHistoryDoc.Parse(actualHistoryJson.c_str()); + ASSERT_FALSE(actualHistoryDoc.HasParseError()) << actualHistoryJson; + EXPECT_TRUE(actualHistoryDoc == expectedDoc["messages"]) + << "ChatHistory messages mismatch.\n actual: " << actualHistoryJson + << "\n expected: " << expectedChatCompletions; + // Tools on the C++ path are exposed via parseToolsToJsonContainer() — that + // is exactly what the non-Python servable forwards to GenAI. Compare its + // serialised JSON against the expected chat/completions tools. 
+ if (expectedDoc.HasMember("tools")) { + auto toolsStatus = handler->parseToolsToJsonContainer(); + ASSERT_TRUE(toolsStatus.ok()) << "parseToolsToJsonContainer failed: " << toolsStatus.status().message(); + ASSERT_TRUE(toolsStatus.value().has_value()) << "parseToolsToJsonContainer returned nullopt"; + const std::string actualToolsJson = toolsStatus.value()->to_json_string(); + rapidjson::Document actualToolsDoc; + actualToolsDoc.Parse(actualToolsJson.c_str()); + ASSERT_FALSE(actualToolsDoc.HasParseError()) << actualToolsJson; + EXPECT_TRUE(actualToolsDoc == expectedDoc["tools"]) + << "parseToolsToJsonContainer mismatch.\n actual: " << actualToolsJson + << "\n expected: " << expectedChatCompletions; + } + +#if (PYTHON_DISABLE == 0) + // --- processedJson path (Python Jinja chat template) --- + const std::string actualProcessedJson = handler->getProcessedJson(); + rapidjson::Document actualProcessedDoc; + actualProcessedDoc.Parse(actualProcessedJson.c_str()); + ASSERT_FALSE(actualProcessedDoc.HasParseError()) << actualProcessedJson; + ASSERT_TRUE(actualProcessedDoc.HasMember("messages")) << actualProcessedJson; + EXPECT_TRUE(actualProcessedDoc["messages"] == expectedDoc["messages"]) + << "processedJson messages mismatch.\n actual: " << actualProcessedJson + << "\n expected: " << expectedChatCompletions; + if (expectedDoc.HasMember("tools")) { + ASSERT_TRUE(actualProcessedDoc.HasMember("tools")) << actualProcessedJson; + EXPECT_TRUE(actualProcessedDoc["tools"] == expectedDoc["tools"]) + << "processedJson tools mismatch.\n actual: " << actualProcessedJson + << "\n expected: " << expectedChatCompletions; + } +#endif +} +} // namespace + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFlatToolsNormaliseToChatCompletions) { + // Responses-flat tools shape ({type, name, parameters}) must be rewritten + // to chat/completions nested shape ({type, function:{...}}) before the + // request is forwarded to the chat template. 
Input is given as an array so + // both ChatHistory and processedJson sinks populate the messages array. + expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [{"role":"user","content":[{"type":"input_text","text":"hello"}]}], + "tools": [{ + "type": "function", + "name": "get_weather", + "description": "Get current weather", + "parameters": {"type":"object","properties":{"city":{"type":"string"}},"required":["city"]} + }] + })", + R"({ + "messages": [{"role":"user","content":"hello"}], + "tools": [{ + "type":"function", + "function":{ + "name":"get_weather", + "description":"Get current weather", + "parameters":{"type":"object","properties":{"city":{"type":"string"}},"required":["city"]} + } + }] + })"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesAlreadyNestedToolsAreLeftIntact) { + // Tools already in chat/completions nested shape must pass through without + // double-wrapping. This is asserted directly on the (in-place mutated) + // request document because the equivalence helper would not detect a + // spurious unwrap+rewrap that nets to the same shape. + std::string json = R"({ + "model": "llama", + "input": "hello", + "tools": [{ + "type": "function", + "function": { + "name": "get_weather", + "parameters": {"type":"object","properties":{"city":{"type":"string"}},"required":["city"]} + } + }] + })"; + auto apiHandler = parseResponses(doc, *tokenizer, json); + EXPECT_TRUE(apiHandler->areToolsAvailable()); + ASSERT_TRUE(doc["tools"][0].HasMember("function")); + EXPECT_STREQ(doc["tools"][0]["function"]["name"].GetString(), "get_weather"); + EXPECT_FALSE(doc["tools"][0]["function"].HasMember("function")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesReasoningBufferedOntoNextAssistantMessage) { + // A bare reasoning item, then an assistant message: the reasoning text + // rides on the next assistant message as reasoning_content and does NOT + // produce its own message. 
+ expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text","text":"hi"}]}, + {"type": "reasoning", "summary": [{"type":"summary_text","text":"think first"}]}, + {"role": "assistant", "content": [{"type":"output_text","text":"hello"}]} + ] + })", + R"({ + "messages": [ + {"role":"user","content":"hi"}, + {"role":"assistant","content":"hello","reasoning_content":"think first"} + ] + })"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesStandaloneReasoningWithoutAssistantIsEmitted) { + // Reasoning followed by a non-assistant/non-function_call item is flushed + // as a standalone assistant turn carrying ONLY reasoning_content (no + // `content`, no `tool_calls`). This preserves the chain-of-thought across + // turns even when the prior turn produced no visible output. + expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text","text":"hi"}]}, + {"type": "reasoning", "summary": [{"type":"summary_text","text":"orphan"}]}, + {"role": "user", "content": [{"type":"input_text","text":"again"}]} + ] + })", + R"({ + "messages": [ + {"role":"user","content":"hi"}, + {"role":"assistant","reasoning_content":"orphan"}, + {"role":"user","content":"again"} + ] + })"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesTrailingStandaloneReasoningIsEmitted) { + // Input ending with a reasoning item — the buffered reasoning is flushed + // as a trailing standalone assistant turn rather than silently lost. 
+ expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text","text":"hi"}]}, + {"type": "reasoning", "summary": [{"type":"summary_text","text":"trailing"}]} + ] + })", + R"({ + "messages": [ + {"role":"user","content":"hi"}, + {"role":"assistant","reasoning_content":"trailing"} + ] + })"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFunctionCallMergedIntoAssistantToolCalls) { + // function_call followed by function_call_output should produce: + // user -> assistant(content="", tool_calls=[...]) -> tool(tool_call_id=...) + // The synthesised assistant message MUST own a tool_calls field; otherwise + // gpt-oss raises "Message has tool role, but there was no previous + // assistant message with a tool call!". + expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text","text":"weather?"}]}, + {"type": "function_call", "id": "call_1", "call_id": "call_1", + "name": "get_weather", "arguments": "{\"city\":\"Paris\"}"}, + {"type": "function_call_output", "call_id": "call_1", + "output": "{\"temp_c\":17}"} + ] + })", + R"({ + "messages": [ + {"role":"user","content":"weather?"}, + {"role":"assistant","content":"","tool_calls":[ + {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"Paris\"}"}} + ]}, + {"role":"tool","tool_call_id":"call_1","content":"{\"temp_c\":17}"} + ] + })"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesReasoningPlusFunctionCallRidesOnAssistant) { + // reasoning + function_call should both attach to the synthesised assistant + // turn that owns the tool_calls. 
+ expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text","text":"weather?"}]}, + {"type": "reasoning", "summary": [{"type":"summary_text","text":"need to call get_weather"}]}, + {"type": "function_call", "id": "call_1", "call_id": "call_1", + "name": "get_weather", "arguments": "{\"city\":\"Paris\"}"}, + {"type": "function_call_output", "call_id": "call_1", "output": "ok"} + ] + })", + R"({ + "messages": [ + {"role":"user","content":"weather?"}, + {"role":"assistant","content":"","reasoning_content":"need to call get_weather","tool_calls":[ + {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"Paris\"}"}} + ]}, + {"role":"tool","tool_call_id":"call_1","content":"ok"} + ] + })"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesMultipleFunctionCallsMergedInOneAssistant) { + // Two function_calls back-to-back must produce a single assistant message + // with two entries in tool_calls, not two assistant turns. 
+ expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text","text":"weather?"}]}, + {"type": "function_call", "id": "call_1", "call_id": "call_1", + "name": "get_weather", "arguments": "{\"city\":\"Paris\"}"}, + {"type": "function_call", "id": "call_2", "call_id": "call_2", + "name": "get_weather", "arguments": "{\"city\":\"London\"}"}, + {"type": "function_call_output", "call_id": "call_1", "output": "15C"} + ] + })", + R"({ + "messages": [ + {"role":"user","content":"weather?"}, + {"role":"assistant","content":"","tool_calls":[ + {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"Paris\"}"}}, + {"id":"call_2","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"London\"}"}} + ]}, + {"role":"tool","tool_call_id":"call_1","content":"15C"} + ] + })"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesTrailingFunctionCallFlushedAsAssistant) { + // Input ending with a function_call (no matching output) — the trailing + // function_call must still be flushed as an assistant turn rather than + // silently lost. 
+ expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text","text":"weather?"}]}, + {"type": "function_call", "id": "call_1", "call_id": "call_1", + "name": "get_weather", "arguments": "{\"city\":\"Paris\"}"} + ] + })", + R"({ + "messages": [ + {"role":"user","content":"weather?"}, + {"role":"assistant","content":"","tool_calls":[ + {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"Paris\"}"}} + ]} + ] + })"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesAssistantMessageAbsorbsBufferedFunctionCall) { + // If an assistant role item follows a function_call, its text content + // should ride on the same merged message (assistant-with-tool_calls), not + // produce a second assistant turn. + expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text","text":"weather?"}]}, + {"type": "function_call", "id": "call_1", "call_id": "call_1", + "name": "get_weather", "arguments": "{\"city\":\"Paris\"}"}, + {"role": "assistant", "content": "calling tool"} + ] + })", + R"({ + "messages": [ + {"role":"user","content":"weather?"}, + {"role":"assistant","content":"calling tool","tool_calls":[ + {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"Paris\"}"}} + ]} + ] + })"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesReasoningContentArrayShapeAccepted) { + // The newer reasoning shape: content[].text instead of summary[].text. + // OVMS accepts both and produces the same chat/completions output. 
+ expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text","text":"hi"}]}, + {"type": "reasoning", "content": [{"type":"reasoning_text","text":"new shape"}]}, + {"role": "assistant", "content": "ok"} + ] + })", + R"({ + "messages": [ + {"role":"user","content":"hi"}, + {"role":"assistant","content":"ok","reasoning_content":"new shape"} + ] + })"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFunctionCallOutputWithoutCallIdAccepted) { + // function_call_output without call_id: the resulting tool message has no + // tool_call_id field rather than failing parsing or carrying an empty id. + expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text","text":"weather?"}]}, + {"type": "function_call", "id": "call_1", "call_id": "call_1", + "name": "get_weather", "arguments": "{}"}, + {"type": "function_call_output", "output": "ok"} + ] + })", + R"({ + "messages": [ + {"role":"user","content":"weather?"}, + {"role":"assistant","content":"","tool_calls":[ + {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{}"}} + ]}, + {"role":"tool","content":"ok"} + ] + })"); +} + +// --- Tools normalisation edge cases --- + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFlatToolWithoutParametersIsNormalised) { + // Flat Responses tools may omit `parameters` for zero-arg functions. The + // nested form should still be produced (with no `parameters` key under + // function), not fail or fabricate one. Input is given as an array so + // both ChatHistory and processedJson sinks populate the messages array. 
+ expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [{"role":"user","content":[{"type":"input_text","text":"hello"}]}], + "tools": [{"type": "function", "name": "ping", "description": "no args"}] + })", + R"({ + "messages": [{"role":"user","content":"hello"}], + "tools": [{"type":"function","function":{"name":"ping","description":"no args"}}] + })"); +} + +// --- Error paths --- + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesInputItemMissingRoleIsRejected) { + // An input item with no recognised `type` and no `role` cannot be + // classified — the chat-history sink must surface this as an + // InvalidArgumentError rather than silently dropping the turn. + std::string json = R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text","text":"hi"}]}, + {"content": [{"type":"output_text","text":"orphaned"}]} + ] + })"; + auto status = tryParseResponses(doc, *tokenizer, json); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); + EXPECT_THAT(std::string(status.message()), ::testing::HasSubstr("role")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesInputContentNotStringOrArrayIsRejected) { + std::string json = R"({ + "model": "llama", + "input": [ + {"role": "user", "content": 42} + ] + })"; + auto status = tryParseResponses(doc, *tokenizer, json); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); + EXPECT_THAT(std::string(status.message()), ::testing::HasSubstr("content")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesInputContentItemMissingTypeIsRejected) { + std::string json = R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"text":"no type field"}]} + ] + })"; + auto status = tryParseResponses(doc, *tokenizer, json); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); + EXPECT_THAT(std::string(status.message()), ::testing::HasSubstr("type")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, 
ResponsesInputTextMissingTextFieldIsRejected) { + std::string json = R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text"}]} + ] + })"; + auto status = tryParseResponses(doc, *tokenizer, json); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); + EXPECT_THAT(std::string(status.message()), ::testing::HasSubstr("text")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesInputArrayItemNotObjectIsRejected) { + std::string json = R"({ + "model": "llama", + "input": ["not an object"] + })"; + auto status = tryParseResponses(doc, *tokenizer, json); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); + EXPECT_THAT(std::string(status.message()), ::testing::HasSubstr("must be objects")); +} + +// --- Multi-turn composite --- + +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesMultiTurnReasoningFunctionCallAndFollowupAssistant) { + // End-to-end: user -> reasoning + function_call (merged on synthesised + // assistant) -> function_call_output -> reasoning + assistant final answer. + // Validates that buffering state is correctly reset between turns. 
+ expectResponsesEquivalentToChatCompletions(doc, *tokenizer, + R"({ + "model": "llama", + "input": [ + {"role": "user", "content": [{"type":"input_text","text":"weather in Paris?"}]}, + {"type": "reasoning", "summary": [{"type":"summary_text","text":"need to call get_weather"}]}, + {"type": "function_call", "id": "call_1", "call_id": "call_1", + "name": "get_weather", "arguments": "{\"city\":\"Paris\"}"}, + {"type": "function_call_output", "call_id": "call_1", "output": "sunny, 22C"}, + {"type": "reasoning", "summary": [{"type":"summary_text","text":"format the answer"}]}, + {"role": "assistant", "content": [{"type":"output_text","text":"It is sunny and 22C in Paris."}]} + ] + })", + R"({ + "messages": [ + {"role":"user","content":"weather in Paris?"}, + {"role":"assistant","content":"","reasoning_content":"need to call get_weather","tool_calls":[ + {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"Paris\"}"}} + ]}, + {"role":"tool","tool_call_id":"call_1","content":"sunny, 22C"}, + {"role":"assistant","content":"It is sunny and 22C in Paris.","reasoning_content":"format the answer"} + ] + })"); +} + +// Real BFCL replay shape: between every function_call and its function_call_output +// the OpenAI SDK echoes back the empty assistant message that ovms returned in +// `output[]`. With multiple turns this looks like: +// user -> fc1 -> {id:msg-0,role:assistant,type:message,content:[{type:output_text,text:""}]} +// -> fco1 -> fc2 -> msg-0 -> fco2 -> ... +// The 4th request OVMS sees while running BFCL multi_turn_base_0 reports 128 +// MORE input_tokens on /responses than the equivalent /chat/completions call, +// even though the message lists are structurally equivalent. This test +// reproduces the exact shape so processedJson can be compared head-to-head. 
+TEST_F(HttpOpenAIHandlerParsingTest, ResponsesBfclReplayShapeWithEchoedAssistantMessages) {
+    // Responses API request under test. Each function_call item is followed by
+    // an echoed assistant message (id "msg-0") whose only content is an empty
+    // output_text, and only then by the matching function_call_output.
+    const std::string replayedResponsesRequest = R"({
+        "model": "llama",
+        "input": [
+            {"role": "user", "content": "do work"},
+            {"type": "function_call", "id": "fc1", "call_id": "fc1",
+             "name": "mkdir", "arguments": "{\"dir_name\":\"temp\"}",
+             "namespace": null, "status": "completed"},
+            {"id": "msg-0", "type": "message", "role": "assistant", "status": "completed",
+             "content": [{"type": "output_text", "text": "", "annotations": [], "logprobs": null}],
+             "phase": null},
+            {"type": "function_call_output", "call_id": "fc1", "output": "None"},
+            {"type": "function_call", "id": "fc2", "call_id": "fc2",
+             "name": "mv", "arguments": "{\"source\":\"a\",\"destination\":\"temp\"}",
+             "namespace": null, "status": "completed"},
+            {"id": "msg-0", "type": "message", "role": "assistant", "status": "completed",
+             "content": [{"type": "output_text", "text": "", "annotations": [], "logprobs": null}],
+             "phase": null},
+            {"type": "function_call_output", "call_id": "fc2", "output": "{\"error\":\"no\"}"}
+        ]
+    })";
+
+    // Expected chat/completions request. The echoed empty assistant messages
+    // add no turns of their own: each function_call appears as one assistant
+    // message with empty content and a single tool_calls entry, followed by
+    // its tool message.
+    const std::string expectedChatRequest = R"({
+        "messages": [
+            {"role":"user","content":"do work"},
+            {"role":"assistant","content":"","tool_calls":[
+                {"id":"fc1","type":"function","function":{"name":"mkdir","arguments":"{\"dir_name\":\"temp\"}"}}
+            ]},
+            {"role":"tool","tool_call_id":"fc1","content":"None"},
+            {"role":"assistant","content":"","tool_calls":[
+                {"id":"fc2","type":"function","function":{"name":"mv","arguments":"{\"source\":\"a\",\"destination\":\"temp\"}"}}
+            ]},
+            {"role":"tool","tool_call_id":"fc2","content":"{\"error\":\"no\"}"}
+        ]
+    })";
+
+    expectResponsesEquivalentToChatCompletions(doc, *tokenizer, replayedResponsesRequest, expectedChatRequest);
+}