From 9a8edc5b2e3913feae2ccb969d7d892427cb8d48 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Thu, 26 Feb 2026 16:13:32 +0100
Subject: [PATCH 01/24] Responses API init

---
 src/http_rest_api_handler.cpp                 |   2 +-
 src/llm/apis/openai_completions.cpp           | 789 ++++++++++++++++++
 src/llm/apis/openai_completions.hpp           |   5 +
 .../continuous_batching/servable.cpp          |  15 +-
 src/llm/servable.cpp                          |  56 +-
 .../continuous_batching/servable.cpp          |   6 +-
 .../visual_language_model/legacy/servable.cpp |   6 +-
 src/test/http_openai_handler_test.cpp         | 526 ++++++++++++
 .../complete_flow_test.cpp                    | 191 +++++
 9 files changed, 1587 insertions(+), 9 deletions(-)

diff --git a/src/http_rest_api_handler.cpp b/src/http_rest_api_handler.cpp
index 33a81cb429..c295edab87 100644
--- a/src/http_rest_api_handler.cpp
+++ b/src/http_rest_api_handler.cpp
@@ -531,7 +531,7 @@ static Status createV3HttpPayload(
         return Status(StatusCode::JSON_INVALID, "model field is not a string");
     }
 
-    bool isTextGenerationEndpoint = uri.find("completions") != std::string_view::npos;
+    bool isTextGenerationEndpoint = (uri.find("completions") != std::string_view::npos) || (uri.find("responses") != std::string_view::npos);
     if (isTextGenerationEndpoint) {
         auto streamIt = parsedJson->FindMember("stream");
         if (streamIt != parsedJson->MemberEnd()) {
diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index f8afc07134..afcd8daf8e 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -17,6 +17,7 @@
 #include "openai_completions.hpp"
 
 #include
+#include
 #include
 #include
 #include
@@ -95,6 +96,328 @@ ov::genai::JsonContainer rapidJsonValueToJsonContainer(const rapidjson::Value& v
     throw std::invalid_argument("Unsupported JSON value type");
 }
 
+std::string serializeResponsesUnaryResponse(
+    const std::vector<ParsedOutput>& parsedOutputs,
+    const CompletionUsageStatistics& usage,
+    const OpenAIChatCompletionsRequest& request,
+    const ToolsSchemas_t& toolNameSchemaMap,
+    std::chrono::time_point<std::chrono::system_clock> created) {
+    const auto createdAt = std::chrono::duration_cast<std::chrono::seconds>(created.time_since_epoch()).count();
+    const std::string responseId = "resp-" + std::to_string(createdAt);
+
+    auto serializeResponsesToolChoice = [&request](Writer<StringBuffer>& writer) {
+        writer.String("tool_choice");
+        if (request.toolChoice.empty()) {
+            writer.String("auto");
+        } else if (request.toolChoice == "auto" || request.toolChoice == "none" || request.toolChoice == "required") {
+            writer.String(request.toolChoice.c_str());
+        } else {
+            writer.StartObject();
+            writer.String("type");
+            writer.String("function");
+            writer.String("name");
+            writer.String(request.toolChoice.c_str());
+            writer.EndObject();
+        }
+    };
+
+    auto serializeResponsesTools = [&toolNameSchemaMap](Writer<StringBuffer>& writer) {
+        writer.String("tools");
+        writer.StartArray();
+        for (const auto& [toolName, toolSchemaWrapper] : toolNameSchemaMap) {
+            writer.StartObject();
+            writer.String("type");
+            writer.String("function");
+            writer.String("name");
+            writer.String(toolName.c_str());
+            writer.String("parameters");
+            writer.RawValue(toolSchemaWrapper.stringRepr.c_str(), toolSchemaWrapper.stringRepr.size(), rapidjson::kObjectType);
+            writer.EndObject();
+        }
+        writer.EndArray();
+    };
+
+    StringBuffer buffer;
+    Writer<StringBuffer> writer(buffer);
+
+    writer.StartObject();
+    writer.String("id");
+    writer.String(responseId.c_str());
+    writer.String("object");
+    writer.String("response");
+    writer.String("created_at");
+    writer.Int64(createdAt);
+    writer.String("completed_at");
+    writer.Int64(createdAt);
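+    // The unary path serializes only after generation has finished, so completed_at mirrors created_at.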
writer.String("model"); + writer.String(request.model.c_str()); + writer.String("status"); + writer.String("completed"); + + writer.String("parallel_tool_calls"); + writer.Bool(false); + serializeResponsesToolChoice(writer); + serializeResponsesTools(writer); + + if (request.maxTokens.has_value()) { + writer.String("max_output_tokens"); + writer.Uint64(static_cast(request.maxTokens.value())); + } + + writer.String("output"); + writer.StartArray(); + int outputIndex = 0; + for (const auto& parsedOutput : parsedOutputs) { + const std::string outputId = "msg-" + std::to_string(outputIndex++); + + writer.StartObject(); + writer.String("id"); + writer.String(outputId.c_str()); + writer.String("type"); + writer.String("message"); + writer.String("role"); + writer.String("assistant"); + writer.String("status"); + writer.String("completed"); + writer.String("content"); + writer.StartArray(); + writer.StartObject(); + writer.String("type"); + writer.String("output_text"); + writer.String("text"); + writer.String(parsedOutput.content.c_str()); + writer.String("annotations"); + writer.StartArray(); + writer.EndArray(); + writer.EndObject(); + writer.EndArray(); + writer.EndObject(); + } + writer.EndArray(); + + writer.String("usage"); + writer.StartObject(); + writer.String("input_tokens"); + writer.Uint64(static_cast(usage.promptTokens)); + writer.String("input_tokens_details"); + writer.StartObject(); + writer.String("cached_tokens"); + writer.Uint64(0); + writer.EndObject(); + writer.String("output_tokens"); + writer.Uint64(static_cast(usage.completionTokens)); + writer.String("output_tokens_details"); + writer.StartObject(); + writer.String("reasoning_tokens"); + writer.Uint64(0); + writer.EndObject(); + writer.String("total_tokens"); + writer.Uint64(static_cast(usage.calculateTotalTokens())); + writer.EndObject(); + + writer.EndObject(); + + return buffer.GetString(); +} + +absl::Status normalizeResponsesFunctionToolsInPlace(rapidjson::Document& doc) { + auto toolsIt = doc.FindMember("tools"); + if (toolsIt == doc.MemberEnd() || toolsIt->value.IsNull()) { + return absl::OkStatus(); + } + if (!toolsIt->value.IsArray()) { + return absl::InvalidArgumentError("Tools are not an array"); + } + + auto& allocator = doc.GetAllocator(); + for (auto& toolValue : toolsIt->value.GetArray()) { + if (!toolValue.IsObject()) { + return absl::InvalidArgumentError("Tool is not a JSON object"); + } + auto toolObj = toolValue.GetObject(); + auto typeIt = toolObj.FindMember("type"); + if (typeIt == toolObj.MemberEnd() || !typeIt->value.IsString()) { + return absl::InvalidArgumentError("Tool type is missing or invalid"); + } + if (std::string(typeIt->value.GetString()) != "function") { + return absl::InvalidArgumentError("Only function tools are supported"); + } + + auto functionIt = toolObj.FindMember("function"); + if (functionIt != toolObj.MemberEnd()) { + if (!functionIt->value.IsObject()) { + return absl::InvalidArgumentError("Function is not a valid JSON object"); + } + continue; + } + + auto nameIt = toolObj.FindMember("name"); + if (nameIt == toolObj.MemberEnd() || !nameIt->value.IsString()) { + return absl::InvalidArgumentError("Function object does not contain a valid name field"); + } + + rapidjson::Value functionObj(rapidjson::kObjectType); + functionObj.AddMember("name", rapidjson::Value(nameIt->value.GetString(), allocator), allocator); + + auto descriptionIt = toolObj.FindMember("description"); + if (descriptionIt != toolObj.MemberEnd() && descriptionIt->value.IsString()) { + 
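+            // Copy the flat tool's optional description into the nested "function" object expected by the Chat Completions parser.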
functionObj.AddMember("description", rapidjson::Value(descriptionIt->value.GetString(), allocator), allocator); + } + + auto parametersIt = toolObj.FindMember("parameters"); + if (parametersIt != toolObj.MemberEnd()) { + if (!parametersIt->value.IsObject()) { + return absl::InvalidArgumentError("Function parameters are not a valid JSON object"); + } + rapidjson::Value parametersCopy(rapidjson::kObjectType); + parametersCopy.CopyFrom(parametersIt->value, allocator); + functionObj.AddMember("parameters", parametersCopy, allocator); + } + + toolValue.AddMember("function", functionObj, allocator); + } + + auto toolChoiceIt = doc.FindMember("tool_choice"); + if (toolChoiceIt != doc.MemberEnd() && !toolChoiceIt->value.IsNull() && toolChoiceIt->value.IsObject()) { + auto toolChoiceObj = toolChoiceIt->value.GetObject(); + auto functionIt = toolChoiceObj.FindMember("function"); + if (functionIt == toolChoiceObj.MemberEnd()) { + auto typeIt = toolChoiceObj.FindMember("type"); + auto nameIt = toolChoiceObj.FindMember("name"); + if (typeIt != toolChoiceObj.MemberEnd() && typeIt->value.IsString() && std::string(typeIt->value.GetString()) == "function") { + if (nameIt == toolChoiceObj.MemberEnd() || !nameIt->value.IsString()) { + return absl::InvalidArgumentError("tool_choice.name is not a valid string"); + } + + rapidjson::Value functionObj(rapidjson::kObjectType); + functionObj.AddMember("name", rapidjson::Value(nameIt->value.GetString(), allocator), allocator); + toolChoiceIt->value.AddMember("function", functionObj, allocator); + } + } + } + + return absl::OkStatus(); +} + +absl::Status normalizeResponsesInputToMessagesInPlace(rapidjson::Document& doc) { + auto inputIt = doc.FindMember("input"); + if (inputIt == doc.MemberEnd()) { + return absl::InvalidArgumentError("input missing in request"); + } + auto& allocator = doc.GetAllocator(); + if (inputIt->value.IsString()) { + rapidjson::Value messages(rapidjson::kArrayType); + rapidjson::Value messageObj(rapidjson::kObjectType); + messageObj.AddMember("role", "user", allocator); + messageObj.AddMember("content", rapidjson::Value(inputIt->value.GetString(), allocator), allocator); + messages.PushBack(messageObj, allocator); + + auto existingMessages = doc.FindMember("messages"); + if (existingMessages != doc.MemberEnd()) { + existingMessages->value = messages; + } else { + doc.AddMember("messages", messages, allocator); + } + return absl::OkStatus(); + } + if (!inputIt->value.IsArray()) { + return absl::InvalidArgumentError("input is not a string or array"); + } + + rapidjson::Value messages(rapidjson::kArrayType); + for (auto& item : inputIt->value.GetArray()) { + if (!item.IsObject()) { + return absl::InvalidArgumentError("input array items must be objects"); + } + + auto itemObj = item.GetObject(); + auto roleIt = itemObj.FindMember("role"); + if (roleIt == itemObj.MemberEnd() || !roleIt->value.IsString()) { + return absl::InvalidArgumentError("input item role is missing or invalid"); + } + + rapidjson::Value messageObj(rapidjson::kObjectType); + messageObj.AddMember("role", rapidjson::Value(roleIt->value.GetString(), allocator), allocator); + + auto contentIt = itemObj.FindMember("content"); + if (contentIt == itemObj.MemberEnd()) { + return absl::InvalidArgumentError("input item content is missing"); + } + + if (contentIt->value.IsString()) { + messageObj.AddMember("content", rapidjson::Value(contentIt->value.GetString(), allocator), allocator); + messages.PushBack(messageObj, allocator); + continue; + } + + if (!contentIt->value.IsArray()) { + 
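+            // Anything other than a plain string or an array of typed content parts is rejected.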
return absl::InvalidArgumentError("input item content must be a string or array"); + } + + rapidjson::Value normalizedContent(rapidjson::kArrayType); + for (auto& contentItem : contentIt->value.GetArray()) { + if (!contentItem.IsObject()) { + return absl::InvalidArgumentError("input content items must be objects"); + } + auto contentObj = contentItem.GetObject(); + auto typeIt = contentObj.FindMember("type"); + if (typeIt == contentObj.MemberEnd() || !typeIt->value.IsString()) { + return absl::InvalidArgumentError("input content item type is missing or invalid"); + } + + std::string type = typeIt->value.GetString(); + if (type == "input_text") { + auto textIt = contentObj.FindMember("text"); + if (textIt == contentObj.MemberEnd() || !textIt->value.IsString()) { + return absl::InvalidArgumentError("input_text requires a valid text field"); + } + rapidjson::Value textObj(rapidjson::kObjectType); + textObj.AddMember("type", "text", allocator); + textObj.AddMember("text", rapidjson::Value(textIt->value.GetString(), allocator), allocator); + normalizedContent.PushBack(textObj, allocator); + } else if (type == "input_image") { + std::string imageUrl; + auto imageUrlIt = contentObj.FindMember("image_url"); + if (imageUrlIt == contentObj.MemberEnd()) { + return absl::InvalidArgumentError("input_image requires image_url field"); + } + if (imageUrlIt->value.IsString()) { + imageUrl = imageUrlIt->value.GetString(); + } else if (imageUrlIt->value.IsObject()) { + auto imageUrlObj = imageUrlIt->value.GetObject(); + auto urlIt = imageUrlObj.FindMember("url"); + if (urlIt == imageUrlObj.MemberEnd() || !urlIt->value.IsString()) { + return absl::InvalidArgumentError("input_image.image_url.url is missing or invalid"); + } + imageUrl = urlIt->value.GetString(); + } else { + return absl::InvalidArgumentError("input_image.image_url must be a string or object"); + } + + rapidjson::Value imageUrlObj(rapidjson::kObjectType); + imageUrlObj.AddMember("url", rapidjson::Value(imageUrl.c_str(), allocator), allocator); + + rapidjson::Value imageObj(rapidjson::kObjectType); + imageObj.AddMember("type", "image_url", allocator); + imageObj.AddMember("image_url", imageUrlObj, allocator); + normalizedContent.PushBack(imageObj, allocator); + } else { + return absl::InvalidArgumentError("Unsupported content type"); + } + } + messageObj.AddMember("content", normalizedContent, allocator); + messages.PushBack(messageObj, allocator); + } + + auto existingMessages = doc.FindMember("messages"); + if (existingMessages != doc.MemberEnd()) { + existingMessages->value = messages; + } else { + doc.AddMember("messages", messages, allocator); + } + return absl::OkStatus(); +} + } // namespace absl::Status OpenAIChatCompletionsHandler::parseCompletionsPart() { @@ -670,6 +993,120 @@ absl::Status OpenAIChatCompletionsHandler::parseChatCompletionsPart(std::optiona return absl::OkStatus(); } +absl::Status OpenAIChatCompletionsHandler::parseResponsesPart(std::optional maxTokensLimit, std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains) { + // input: string; required + auto it = doc.FindMember("input"); + if (it == doc.MemberEnd()) { + return absl::InvalidArgumentError("input missing in request"); + } + + auto normalizeInputStatus = normalizeResponsesInputToMessagesInPlace(doc); + if (!normalizeInputStatus.ok()) { + return normalizeInputStatus; + } + + it = doc.FindMember("input"); + if (it == doc.MemberEnd()) { + return absl::InvalidArgumentError("input missing in request"); + } + + if (it->value.IsString()) { + 
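+        // Keep the raw string input around as the prompt fallback used when no chat history is built.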
request.prompt = it->value.GetString(); + if (!request.prompt.has_value() || !request.prompt.value().size()) { + return absl::InvalidArgumentError("input cannot be empty"); + } + } + + auto messagesStatus = parseMessages(allowedLocalMediaPath, allowedMediaDomains); + if (!messagesStatus.ok()) { + return messagesStatus; + } + + // logprobs: bool; optional - defaults to false + it = doc.FindMember("logprobs"); + if (it != doc.MemberEnd() && !it->value.IsNull()) { + if (!it->value.IsBool()) + return absl::InvalidArgumentError("logprobs accepts values true or false"); + request.logprobschat = it->value.GetBool(); + } + if (request.logprobschat && request.stream) { + return absl::InvalidArgumentError("logprobs are not supported in streaming mode."); + } + + auto toolsStatus = normalizeResponsesFunctionToolsInPlace(doc); + if (!toolsStatus.ok()) { + return toolsStatus; + } + toolsStatus = parseTools(); + if (!toolsStatus.ok()) { + return toolsStatus; + } + + std::optional maxCompletionTokens; + std::optional maxOutputTokens; + + // max_completion_tokens: uint; optional + it = doc.FindMember("max_completion_tokens"); + if (it != doc.MemberEnd() && !it->value.IsNull()) { + if (!it->value.IsUint()) { + if (it->value.IsUint64()) + return absl::InvalidArgumentError("max_completion_tokens value can't be greater than 4294967295"); + return absl::InvalidArgumentError("max_completion_tokens is not an unsigned integer"); + } + if (maxTokensLimit.has_value() && it->value.GetUint() > maxTokensLimit.value()) + return absl::InvalidArgumentError(absl::StrCat("max_completion_tokens exceeds limit provided in graph config: ", maxTokensLimit.value())); + maxCompletionTokens = it->value.GetUint(); + } + + // max_output_tokens: uint; optional + // OpenAI Responses API uses this field for output token limit. 
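+    // If max_completion_tokens was also provided, the two limits are validated below to match.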
+ it = doc.FindMember("max_output_tokens"); + if (it != doc.MemberEnd() && !it->value.IsNull()) { + if (!it->value.IsUint()) { + if (it->value.IsUint64()) + return absl::InvalidArgumentError("max_output_tokens value can't be greater than 4294967295"); + return absl::InvalidArgumentError("max_output_tokens is not an unsigned integer"); + } + if (maxTokensLimit.has_value() && it->value.GetUint() > maxTokensLimit.value()) + return absl::InvalidArgumentError(absl::StrCat("max_output_tokens exceeds limit provided in graph config: ", maxTokensLimit.value())); + maxOutputTokens = it->value.GetUint(); + } + + if (maxCompletionTokens.has_value() && maxOutputTokens.has_value() && maxCompletionTokens.value() != maxOutputTokens.value()) { + return absl::InvalidArgumentError("max_output_tokens and max_completion_tokens must match when both are provided"); + } + if (maxOutputTokens.has_value()) { + request.maxTokens = maxOutputTokens.value(); + } else if (maxCompletionTokens.has_value()) { + request.maxTokens = maxCompletionTokens.value(); + } + + // specific part of max_tokens validation + if (request.maxTokens == 0) { + return absl::InvalidArgumentError("max_tokens value should be greater than 0"); + } + + // parse response_format + it = doc.FindMember("response_format"); + if (it != doc.MemberEnd()) { + if (it->value.IsNull()) + return absl::OkStatus(); + if (!it->value.IsObject()) + return absl::InvalidArgumentError("response_format is not an object"); + const rapidjson::Value& responseFormat = it->value; + request.responseFormat = convertOpenAIResponseFormatToStructuralTagStringFormat(responseFormat); + } + + { + StringBuffer buffer; + Writer writer(buffer); + doc.Accept(writer); + request.processedJson = buffer.GetString(); + } + + return absl::OkStatus(); +} + absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength) { OVMS_PROFILE_FUNCTION(); // stream: bool; optional @@ -951,6 +1388,8 @@ absl::Status OpenAIChatCompletionsHandler::parseRequest(std::optional return status; if (endpoint == Endpoint::COMPLETIONS) status = parseCompletionsPart(); + else if (endpoint == Endpoint::RESPONSES) + status = parseResponsesPart(maxTokensLimit, allowedLocalMediaPath, allowedMediaDomains); else status = parseChatCompletionsPart(maxTokensLimit, allowedLocalMediaPath, allowedMediaDomains); @@ -1001,6 +1440,16 @@ ParsedOutput OpenAIChatCompletionsHandler::parseOutputIfNeeded(const std::vector std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vector& generationOutputs) { OVMS_PROFILE_FUNCTION(); + if (endpoint == Endpoint::RESPONSES) { + std::vector parsedOutputs; + usage.completionTokens = 0; + for (const ov::genai::GenerationOutput& generationOutput : generationOutputs) { + updateUsage(usage, generationOutput.generated_ids, request.echo); + parsedOutputs.push_back(parseOutputIfNeeded(generationOutput.generated_ids)); + } + return serializeResponsesUnaryResponse(parsedOutputs, usage, request, request.toolNameSchemaMap, created); + } + OpenAiJsonResponse jsonResponse; jsonResponse.StartObject(); @@ -1126,6 +1575,15 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco OVMS_PROFILE_FUNCTION(); usage.promptTokens = results.perf_metrics.get_num_input_tokens(); usage.completionTokens = results.perf_metrics.get_num_generated_tokens(); + if (endpoint == Endpoint::RESPONSES) { + std::vector parsedOutputs; + for (const auto& tokens : results.tokens) { + updateUsage(usage, tokens, 
request.echo); + parsedOutputs.push_back(parseOutputIfNeeded(tokens)); + } + return serializeResponsesUnaryResponse(parsedOutputs, usage, request, request.toolNameSchemaMap, created); + } + OpenAiJsonResponse jsonResponse; jsonResponse.StartObject(); @@ -1186,6 +1644,27 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD OVMS_PROFILE_FUNCTION(); usage.promptTokens = results.perf_metrics.get_num_input_tokens(); usage.completionTokens = results.perf_metrics.get_num_generated_tokens(); + if (endpoint == Endpoint::RESPONSES) { + std::vector parsedOutputs; + usage.completionTokens = 0; + for (const std::string& text : results.texts) { + auto result = tokenizer.encode(text); + auto& input_ids = result.input_ids; + if (input_ids.get_shape().size() != 2) + throw std::runtime_error("input_ids should have 2 dimensions"); + if (input_ids.get_shape()[0] != 1) + throw std::runtime_error("input_ids should have 1 batch size"); + if (input_ids.get_element_type() != ov::element::i64) + throw std::runtime_error("input_ids should have i64 element type"); + + int64_t* input_ids_data = reinterpret_cast(input_ids.data()); + std::vector generatedTokens(input_ids_data, input_ids_data + input_ids.get_shape()[1]); + updateUsage(usage, generatedTokens, request.echo); + parsedOutputs.push_back(parseOutputIfNeeded(generatedTokens)); + } + return serializeResponsesUnaryResponse(parsedOutputs, usage, request, request.toolNameSchemaMap, created); + } + OpenAiJsonResponse jsonResponse; jsonResponse.StartObject(); @@ -1262,6 +1741,313 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason) { OVMS_PROFILE_FUNCTION(); + if (endpoint == Endpoint::RESPONSES) { + const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); + const std::string responseId = "resp-" + std::to_string(createdAt); + const std::string outputItemId = "msg-0"; + + auto serializeResponsesToolChoice = [this](Writer& writer) { + writer.String("tool_choice"); + if (request.toolChoice.empty()) { + writer.String("auto"); + } else if (request.toolChoice == "auto" || request.toolChoice == "none" || request.toolChoice == "required") { + writer.String(request.toolChoice.c_str()); + } else { + writer.StartObject(); + writer.String("type"); + writer.String("function"); + writer.String("name"); + writer.String(request.toolChoice.c_str()); + writer.EndObject(); + } + }; + + auto serializeResponsesTools = [this](Writer& writer) { + writer.String("tools"); + writer.StartArray(); + for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) { + writer.StartObject(); + writer.String("type"); + writer.String("function"); + writer.String("name"); + writer.String(toolName.c_str()); + writer.String("parameters"); + writer.RawValue(toolSchemaWrapper.stringRepr.c_str(), toolSchemaWrapper.stringRepr.size(), rapidjson::kObjectType); + writer.EndObject(); + } + writer.EndArray(); + }; + + auto serializeResponseObject = [this, &responseId, createdAt, &serializeResponsesToolChoice, &serializeResponsesTools](Writer& writer, const char* status, const std::string& fullOutputText, bool includeUsage) { + writer.StartObject(); + writer.String("id"); + writer.String(responseId.c_str()); + writer.String("object"); + writer.String("response"); + writer.String("created_at"); + writer.Int64(createdAt); + if (std::string(status) == "completed") { + 
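+            // completed_at appears only once the response has reached its terminal state.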
writer.String("completed_at"); + writer.Int64(createdAt); + } + writer.String("model"); + writer.String(request.model.c_str()); + writer.String("status"); + writer.String(status); + + writer.String("parallel_tool_calls"); + writer.Bool(false); + serializeResponsesToolChoice(writer); + serializeResponsesTools(writer); + + if (request.maxTokens.has_value()) { + writer.String("max_output_tokens"); + writer.Uint64(static_cast(request.maxTokens.value())); + } + + writer.String("output"); + writer.StartArray(); + if (!fullOutputText.empty()) { + writer.StartObject(); + writer.String("id"); + writer.String("msg-0"); + writer.String("type"); + writer.String("message"); + writer.String("role"); + writer.String("assistant"); + writer.String("status"); + writer.String(std::string(status) == "completed" ? "completed" : "in_progress"); + writer.String("content"); + writer.StartArray(); + writer.StartObject(); + writer.String("type"); + writer.String("output_text"); + writer.String("text"); + writer.String(fullOutputText.c_str()); + writer.String("annotations"); + writer.StartArray(); + writer.EndArray(); + writer.EndObject(); + writer.EndArray(); + writer.EndObject(); + } + writer.EndArray(); + + if (includeUsage) { + writer.String("usage"); + writer.StartObject(); + writer.String("input_tokens"); + writer.Uint64(static_cast(usage.promptTokens)); + writer.String("input_tokens_details"); + writer.StartObject(); + writer.String("cached_tokens"); + writer.Uint64(0); + writer.EndObject(); + writer.String("output_tokens"); + writer.Uint64(static_cast(usage.completionTokens)); + writer.String("output_tokens_details"); + writer.StartObject(); + writer.String("reasoning_tokens"); + writer.Uint64(0); + writer.EndObject(); + writer.String("total_tokens"); + writer.Uint64(static_cast(usage.calculateTotalTokens())); + writer.EndObject(); + } + + writer.EndObject(); + }; + + auto serializeOutputItem = [&outputItemId](Writer& writer, const std::string& text, const char* status, bool withContent) { + writer.StartObject(); + writer.String("id"); + writer.String(outputItemId.c_str()); + writer.String("type"); + writer.String("message"); + writer.String("role"); + writer.String("assistant"); + writer.String("status"); + writer.String(status); + writer.String("content"); + writer.StartArray(); + if (withContent) { + writer.StartObject(); + writer.String("type"); + writer.String("output_text"); + writer.String("text"); + writer.String(text.c_str()); + writer.String("annotations"); + writer.StartArray(); + writer.EndArray(); + writer.EndObject(); + } + writer.EndArray(); + writer.EndObject(); + }; + + auto serializePart = [](Writer& writer, const std::string& text) { + writer.StartObject(); + writer.String("type"); + writer.String("output_text"); + writer.String("text"); + writer.String(text.c_str()); + writer.String("annotations"); + writer.StartArray(); + writer.EndArray(); + writer.EndObject(); + }; + + auto serializeResponsesEvent = [](const std::function&)>& eventSerializer) { + StringBuffer eventBuffer; + Writer eventWriter(eventBuffer); + eventSerializer(eventWriter); + return std::string(eventBuffer.GetString()); + }; + + std::vector events; + if (!responsesStreamingInitialized) { + events.emplace_back(serializeResponsesEvent([this, &serializeResponseObject](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.created"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("response"); + serializeResponseObject(writer, 
"in_progress", "", false); + writer.EndObject(); + })); + + events.emplace_back(serializeResponsesEvent([this, &outputItemId, &serializeOutputItem](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.output_item.added"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("item"); + serializeOutputItem(writer, "", "in_progress", false); + writer.EndObject(); + })); + + events.emplace_back(serializeResponsesEvent([this, &outputItemId, &serializePart](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.content_part.added"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("content_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(outputItemId.c_str()); + writer.String("part"); + serializePart(writer, ""); + writer.EndObject(); + })); + + responsesStreamingInitialized = true; + } + + if (!chunkResponse.empty()) { + responsesStreamingOutputText += chunkResponse; + events.emplace_back(serializeResponsesEvent([this, &chunkResponse, &outputItemId](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.output_text.delta"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("content_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(outputItemId.c_str()); + writer.String("delta"); + writer.String(chunkResponse.c_str()); + writer.String("logprobs"); + writer.StartArray(); + writer.EndArray(); + writer.EndObject(); + })); + } + + if (finishReason != ov::genai::GenerationFinishReason::NONE) { + events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.output_text.done"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("content_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(outputItemId.c_str()); + writer.String("text"); + writer.String(responsesStreamingOutputText.c_str()); + writer.String("logprobs"); + writer.StartArray(); + writer.EndArray(); + writer.EndObject(); + })); + + events.emplace_back(serializeResponsesEvent([this, &outputItemId, &serializePart](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.content_part.done"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("content_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(outputItemId.c_str()); + writer.String("part"); + serializePart(writer, responsesStreamingOutputText); + writer.EndObject(); + })); + + events.emplace_back(serializeResponsesEvent([this, &serializeOutputItem](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.output_item.done"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("item"); + serializeOutputItem(writer, responsesStreamingOutputText, "completed", true); + writer.EndObject(); + })); 
+ + events.emplace_back(serializeResponsesEvent([this, &serializeResponseObject](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.completed"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("response"); + serializeResponseObject(writer, "completed", responsesStreamingOutputText, true); + writer.EndObject(); + })); + } + + if (events.empty()) { + return ""; + } + + std::stringstream ss; + ss << events.front(); + for (size_t i = 1; i < events.size(); ++i) { + ss << "\n\ndata: " << events[i]; + } + return ss.str(); + } + Document doc; doc.SetObject(); Document::AllocatorType& allocator = doc.GetAllocator(); @@ -1356,6 +2142,9 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str std::string OpenAIChatCompletionsHandler::serializeStreamingUsageChunk() { OVMS_PROFILE_FUNCTION(); + if (endpoint == Endpoint::RESPONSES) { + return ""; + } StringBuffer buffer; Writer writer(buffer); diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index a94310c22c..f4f961e18d 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -47,6 +47,7 @@ namespace ovms { enum class Endpoint { CHAT_COMPLETIONS, COMPLETIONS, + RESPONSES, TOKENIZE, }; @@ -70,12 +71,16 @@ class OpenAIChatCompletionsHandler { ov::genai::Tokenizer tokenizer; size_t processedTokens = 0; // tracks overall number of tokens processed by the pipeline bool toolCallsDetectedInStream = false; // tracks whether tool calls were detected in any streaming chunk + size_t responsesStreamingSequenceNumber = 0; + bool responsesStreamingInitialized = false; + std::string responsesStreamingOutputText; // Output parser is used to parse chat completions response to extract specific fields like tool calls and reasoning. 
    std::unique_ptr<OutputParser> outputParser = nullptr;
 
     absl::Status parseCompletionsPart();
     absl::Status parseChatCompletionsPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
+    absl::Status parseResponsesPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
     absl::Status parseCommonPart(std::optional<uint32_t> maxTokensLimit, uint32_t bestOfLimit, std::optional<uint32_t> maxModelLength);
 
     ParsedOutput parseOutputIfNeeded(const std::vector<int64_t>& generatedIds);
diff --git a/src/llm/language_model/continuous_batching/servable.cpp b/src/llm/language_model/continuous_batching/servable.cpp
index 470e170a09..1c14944385 100644
--- a/src/llm/language_model/continuous_batching/servable.cpp
+++ b/src/llm/language_model/continuous_batching/servable.cpp
@@ -103,6 +103,15 @@ static ov::genai::GenerationOutput prepareEmptyStopReasonOutput() {
     return out;
 }
 
+static ov::genai::GenerationOutput prepareEmptyNoneReasonOutput() {
+    static ov::genai::GenerationOutput out = {
+        std::vector<int64_t>(),  // generated_ids
+        std::vector<float>(),    // generated_log_probs
+        0.0f,                    // score
+        ov::genai::GenerationFinishReason::NONE};
+    return out;
+}
+
 absl::Status ContinuousBatchingServable::readCompleteExecutionResults(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
     auto cbExecutionContext = std::static_pointer_cast<ContinuousBatchingServableExecutionContext>(executionContext);
     if (cbExecutionContext->payload.client->isDisconnected()) {
@@ -136,7 +145,11 @@ absl::Status ContinuousBatchingServable::readPartialExecutionResults(std::shared
     ov::genai::GenerationOutputs generationOutputs = cbExecutionContext->generationHandle->read();
     RET_CHECK(generationOutputs.size() <= 1);  // TODO: Support multiple generations
     if (generationOutputs.size() == 0) {
-        cbExecutionContext->generationOutputs = {prepareEmptyStopReasonOutput()};
+        if (cbExecutionContext->generationHandle->get_status() == ov::genai::GenerationStatus::RUNNING) {
+            cbExecutionContext->generationOutputs = {prepareEmptyNoneReasonOutput()};
+        } else {
+            cbExecutionContext->generationOutputs = {prepareEmptyStopReasonOutput()};
+        }
     } else {
         cbExecutionContext->generationOutputs = {generationOutputs.begin()->second};
     }
diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp
index 6d9810ae5f..b9c619057c 100644
--- a/src/llm/servable.cpp
+++ b/src/llm/servable.cpp
@@ -68,10 +68,12 @@ absl::Status GenAiServable::loadRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext, const HttpPayload& payload) {
         executionContext->endpoint = Endpoint::CHAT_COMPLETIONS;
     } else if (payload.uri == "/v3/completions" || payload.uri == "/v3/v1/completions") {
         executionContext->endpoint = Endpoint::COMPLETIONS;
+    } else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") {
+        executionContext->endpoint = Endpoint::RESPONSES;
     } else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
         executionContext->endpoint = Endpoint::TOKENIZE;
     } else {
-        return absl::InvalidArgumentError("Wrong endpoint. Allowed endpoints: /v3/chat/completions, /v3/completions");
+        return absl::InvalidArgumentError("Wrong endpoint. Allowed endpoints: /v3/chat/completions, /v3/completions, /v3/responses, /v3/tokenize");
     }
     executionContext->payload = payload;
     return absl::OkStatus();
@@ -204,6 +206,50 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
+    case Endpoint::RESPONSES: {
+        if (executionContext->apiHandler->getChatHistory().size() > 0) {
+#if (PYTHON_DISABLE == 0)
+            bool success;
+            if (executionContext->apiHandler->getProcessedJson().size() > 0) {
+                success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText);
+            } else {
+                success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->payload.body, inputText);
+            }
+            if (!success) {
+                return absl::Status(absl::StatusCode::kInvalidArgument, inputText);
+            }
+#else
+            ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
+            constexpr bool add_generation_prompt = true;
+            auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer();
+            if (!toolsStatus.ok()) {
+                return toolsStatus.status();
+            }
+            const auto& tools = toolsStatus.value();
+            auto chatTemplateKwargsStatus = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer();
+            if (!chatTemplateKwargsStatus.ok()) {
+                return chatTemplateKwargsStatus.status();
+            }
+            const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value();
+            try {
+                inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools, chatTemplateKwargs);
+            } catch (const std::exception& e) {
+                SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what());
+                return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one.");
+            }
+#endif
+            if (inputText.size() == 0) {
+                return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty");
+            }
+        } else {
+            auto prompt = executionContext->apiHandler->getPrompt();
+            if (!prompt.has_value()) {
+                return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing");
+            }
+            inputText = prompt.value();
+        }
+        break;
+    }
     case Endpoint::COMPLETIONS: {
         inputText = executionContext->apiHandler->getPrompt().value();
         break;
     }
@@ -286,8 +332,12 @@ absl::Status GenAiServable::preparePartialResponse(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
         executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
     }
-    if (executionContext->apiHandler->getStreamOptions().includeUsage)
-        executionContext->response += wrapTextInServerSideEventMessage(executionContext->apiHandler->serializeStreamingUsageChunk());
+    if (executionContext->apiHandler->getStreamOptions().includeUsage) {
+        std::string usageChunk = executionContext->apiHandler->serializeStreamingUsageChunk();
+        if (!usageChunk.empty()) {
+            executionContext->response += wrapTextInServerSideEventMessage(usageChunk);
+        }
+    }
 
     executionContext->response += wrapTextInServerSideEventMessage("[DONE]");
 
diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp
index be33838d9f..94aef05387 100644
--- a/src/llm/visual_language_model/continuous_batching/servable.cpp
+++ b/src/llm/visual_language_model/continuous_batching/servable.cpp
@@ -45,10 +45,12 @@ absl::Status VisualLanguageModelServable::loadRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext, const HttpPayload& payload) {
         executionContext->endpoint = Endpoint::CHAT_COMPLETIONS;
+    } else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") {
+        executionContext->endpoint = Endpoint::RESPONSES;
     } else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
         executionContext->endpoint = Endpoint::TOKENIZE;
     } else {
-        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions endpoint or /v3/tokenize");
+        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions, /v3/responses endpoint or /v3/tokenize");
     }
     executionContext->payload = payload;
     return absl::OkStatus();
@@ -67,7 +69,7 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
     if (executionContext->apiHandler == nullptr) {
         return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized");
     }
-    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS) {
+    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) {
         ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory();
 
         for (size_t i = 0; i < chatHistory.size(); i++) {
diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp
index 2834072410..307723415a 100644
--- a/src/llm/visual_language_model/legacy/servable.cpp
+++ b/src/llm/visual_language_model/legacy/servable.cpp
@@ -53,10 +53,12 @@ absl::Status VisualLanguageModelLegacyServable::loadRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext, const HttpPayload& payload) {
         executionContext->endpoint = Endpoint::CHAT_COMPLETIONS;
+    } else if (payload.uri == "/v3/responses" || payload.uri == "/v3/v1/responses") {
+        executionContext->endpoint = Endpoint::RESPONSES;
     } else if (TokenizeParser::isTokenizeEndpoint(payload.uri)) {
         executionContext->endpoint = Endpoint::TOKENIZE;
     } else {
-        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions endpoint or /v3/tokenize");
+        return absl::InvalidArgumentError("Wrong endpoint. VLM Servable allowed only on /v3/chat/completions, /v3/responses endpoint or /v3/tokenize");
     }
     executionContext->payload = payload;
     return absl::OkStatus();
@@ -237,7 +239,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
     if (executionContext->apiHandler == nullptr) {
         return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized");
     }
-    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS) {
+    if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) {
         ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory();
 
         for (size_t i = 0; i < chatHistory.size(); i++) {
diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index 2aea78eaae..1ac3f208c7 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -269,6 +269,27 @@ TEST_F(HttpOpenAIHandlerTest, Stream) {
     ASSERT_EQ(response, "");
 }
 
+TEST_F(HttpOpenAIHandlerTest, ResponsesStream) {
+    std::string requestBody = R"(
+        {
+            "model": "gpt",
+            "stream": true,
+            "input": "What is OpenVINO?"
+        }
+    )";
+
+    EXPECT_CALL(*writer, PartialReplyBegin(::testing::_)).WillOnce(testing::Invoke([](std::function<void()> fn) { fn(); }));
+    EXPECT_CALL(*writer, PartialReplyEnd()).Times(1);
+    EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(9);
+    EXPECT_CALL(*writer, IsDisconnected()).Times(9);
+
+    ASSERT_EQ(
+        handler->dispatchToProcessor("/v3/responses", requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::PARTIAL_END);
+
+    ASSERT_EQ(response, "");
+}
+
 TEST_F(HttpOpenAIHandlerTest, BodyNotAJson) {
     std::string requestBody = "not a json";
 
@@ -609,6 +630,85 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseVLMSupportsToolCallsFinishReason) {
     ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized;
 }
 
+TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsOutputText) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "max_output_tokens": 5
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::EncodedResults results;
+    ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
+    ASSERT_EQ(outputIds.get_shape().size(), 2);
+    ASSERT_EQ(outputIds.get_shape()[0], 1);
+    ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
+    int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
+    results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+
+    std::string serialized = apiHandler->serializeUnaryResponse(results);
+    ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"output\":"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"type\":\"output_text\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"text\":"), std::string::npos) << serialized;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest,
serializeStreamingChunkForResponsesContainsRequiredEvents) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + std::string firstChunk = apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + ASSERT_NE(firstChunk.find("\"type\":\"response.created\""), std::string::npos) << firstChunk; + ASSERT_NE(firstChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << firstChunk; + ASSERT_NE(firstChunk.find("\"type\":\"response.content_part.added\""), std::string::npos) << firstChunk; + ASSERT_NE(firstChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << firstChunk; + ASSERT_NE(firstChunk.find("\"delta\":\"Hello\""), std::string::npos) << firstChunk; + + std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP); + ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.content_part.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.output_item.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"text\":\"Hello world\""), std::string::npos) << finalChunk; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingUsageChunkForResponsesIsEmpty) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true, + "stream_options": {"include_usage": true} + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ASSERT_EQ(apiHandler->serializeStreamingUsageChunk(), ""); +} + TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) { std::string json = R"({ "model": "llama", @@ -1370,6 +1470,432 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingRequestWithNullParametersCompletions } } +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxOutputTokensSetsMaxTokens) { + std::string json = R"({ + "model": "llama", + "input": "valid prompt", + "max_output_tokens": 7 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_TRUE(apiHandler->getMaxTokens().has_value()); + EXPECT_EQ(apiHandler->getMaxTokens().value(), 7); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesStringInputCreatesUserChatMessage) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?" 
+ })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + auto& chatHistory = apiHandler->getChatHistory(); + ASSERT_EQ(chatHistory.size(), 1); + ASSERT_TRUE(chatHistory[0].contains("role")); + ASSERT_TRUE(chatHistory[0].contains("content")); + EXPECT_EQ(chatHistory[0]["role"], "user"); + EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); + EXPECT_NE(apiHandler->getProcessedJson().find("\"messages\""), std::string::npos); + EXPECT_NE(apiHandler->getProcessedJson().find("\"role\":\"user\""), std::string::npos); + EXPECT_NE(apiHandler->getProcessedJson().find("\"input\":\"What is OpenVINO?\""), std::string::npos); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesConflictingOutputAndCompletionTokensFails) { + std::string json = R"({ + "model": "llama", + "input": "valid prompt", + "max_output_tokens": 5, + "max_completion_tokens": 7 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("max_output_tokens and max_completion_tokens must match when both are provided")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesFlatFunctionToolsSucceeds) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": "auto", + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. 
San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"] + } + }, + "required": ["location", "unit"] + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_TRUE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "auto"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceFunctionObjectSucceeds) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": { + "type": "function", + "name": "get_current_weather" + }, + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + }, + { + "type": "function", + "name": "unused_tool", + "parameters": { + "type": "object", + "properties": { + "arg": {"type": "string"} + } + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_TRUE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "get_current_weather"); +} + +TEST_F(HttpOpenAIHandlerParsingTest, SerializeResponsesUnaryResponseContainsFunctionTools) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": "auto", + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::EncodedResults results; + ov::Tensor outputIds = tokenizer->encode("Sunny", ov::genai::add_special_tokens(false)).input_ids; + ASSERT_EQ(outputIds.get_shape().size(), 2); + ASSERT_EQ(outputIds.get_shape()[0], 1); + ASSERT_EQ(outputIds.get_element_type(), ov::element::i64); + int64_t* outputIdsData = reinterpret_cast(outputIds.data()); + results.tokens = {std::vector(outputIdsData, outputIdsData + outputIds.get_shape()[1])}; + + std::string serialized = apiHandler->serializeUnaryResponse(results); + ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"tools\":[{"), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"type\":\"function\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"name\":\"get_current_weather\""), std::string::npos) << serialized; +} + +TEST_F(HttpOpenAIHandlerParsingTest, 
SerializeResponsesUnaryResponseContainsFunctionToolChoiceObject) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": { + "type": "function", + "name": "get_current_weather" + }, + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + ov::genai::EncodedResults results; + ov::Tensor outputIds = tokenizer->encode("Sunny", ov::genai::add_special_tokens(false)).input_ids; + ASSERT_EQ(outputIds.get_shape().size(), 2); + ASSERT_EQ(outputIds.get_shape()[0], 1); + ASSERT_EQ(outputIds.get_element_type(), ov::element::i64); + int64_t* outputIdsData = reinterpret_cast(outputIds.data()); + results.tokens = {std::vector(outputIdsData, outputIdsData + outputIds.get_shape()[1])}; + + std::string serialized = apiHandler->serializeUnaryResponse(results); + ASSERT_NE(serialized.find("\"tool_choice\":{"), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"type\":\"function\""), std::string::npos) << serialized; + ASSERT_NE(serialized.find("\"name\":\"get_current_weather\""), std::string::npos) << serialized; +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceFunctionObjectMissingNameFails) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": { + "type": "function" + }, + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + } + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("tool_choice.name is not a valid string")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceFunctionObjectNameNotStringFails) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": { + "type": "function", + "name": 7 + }, + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + } + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("tool_choice.name is not a valid string")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageUrlStringSucceeds) { + std::string json = R"({ + "model": "llama", + "input": [ + { + "role": 
"user", + "content": [ + {"type": "input_text", "text": "what is in this image?"}, + {"type": "input_image", "image_url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="} + ] + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_EQ(apiHandler->getImageHistory().size(), 1); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageUrlObjectSucceeds) { + std::string json = R"({ + "model": "llama", + "input": [ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "what is in this image?"}, + {"type": "input_image", "image_url": {"url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="}} + ] + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_EQ(apiHandler->getImageHistory().size(), 1); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageWithoutImageUrlFails) { + std::string json = R"({ + "model": "llama", + "input": [ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "what is in this image?"}, + {"type": "input_image"} + ] + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("input_image requires image_url field")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageUrlInvalidTypeFails) { + std::string json = R"({ + "model": "llama", + "input": [ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "what is in this image?"}, + {"type": "input_image", "image_url": 123} + ] + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("input_image.image_url must be a string or object")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesUnsupportedToolTypeFails) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": "auto", + "tools": [ + { + "type": "web_search_preview" + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = 
std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("Only function tools are supported")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceNoneRemovesTools) { + std::string json = R"({ + "model": "llama", + "input": "What is the weather like in Boston today?", + "tool_choice": "none", + "tools": [ + { + "type": "function", + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + } + } + } + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_FALSE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "none"); +} + // Provide get_weather2 but take none TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided1_ChoiceNone) { std::string providedTools = R"( diff --git a/src/test/llm/visual_language_model/complete_flow_test.cpp b/src/test/llm/visual_language_model/complete_flow_test.cpp index 4dc22d6fa3..5f2b380556 100644 --- a/src/test/llm/visual_language_model/complete_flow_test.cpp +++ b/src/test/llm/visual_language_model/complete_flow_test.cpp @@ -49,6 +49,7 @@ class VLMServableExecutionTest : public ::testing::Test { std::unordered_map headers{{"content-type", "application/json"}}; ovms::HttpRequestComponents comp; const std::string endpointChatCompletions = "/v3/chat/completions"; + const std::string endpointResponses = "/v3/responses"; std::shared_ptr writer; std::shared_ptr multiPartParser; std::string response; @@ -129,6 +130,50 @@ static std::string createRequestBody(const std::string& modelName, const std::ve return oss.str(); } +static std::string createResponsesRequestBody(const std::string& modelName, const std::vector>& fields, bool includeText = true, int numberOfImages = 1, const std::string contentOfTheFirstMessage = "What is in this image?") { + std::ostringstream oss; + oss << R"( + { + "model": ")" + << modelName << R"(", + "input": [ + { + "role": "user", + "content": [)"; + if (includeText) { + oss << R"( + { + "type": "input_text", + "text": ")"; + oss << contentOfTheFirstMessage; + oss << R"("})"; + if (numberOfImages > 0) { + oss << ","; + } + } + for (int i = 0; i < numberOfImages; i++) { + oss << R"( + { + "type": "input_image", + "image_url": "data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGIy+/oREAAA//8DiQIftNKCRwAAAABJRU5ErkJggg==" + })"; + if (i < numberOfImages - 1) { + oss << ","; + } + } + oss << R"( + ] + } + ] + )"; + for (const auto& field : fields) { + oss << R"(, ")" << field.first << R"(": )" << field.second << R"()" + << "\n"; + } + oss << "\n}"; + return oss.str(); +} + class VLMServableExecutionTestParameterized : public VLMServableExecutionTest, public ::testing::WithParamInterface {}; // Unary flow @@ -304,6 +349,152 @@ TEST_P(VLMServableExecutionTestParameterized, unaryBasicWithTools) { EXPECT_STREQ(parsedResponse["model"].GetString(), modelName.c_str()); } +TEST_P(VLMServableExecutionTestParameterized, unaryResponsesWithImageInput) { + auto modelName = GetParam(); + std::vector> fields = { + 
{"max_output_tokens", "5"}, + {"temperature", "0.0"}}; + std::string requestBody = createResponsesRequestBody(modelName, fields); + + ovms::HttpRequestComponents responsesComp; + ASSERT_EQ(handler->parseRequestComponents(responsesComp, "POST", endpointResponses, headers), ovms::StatusCode::OK); + + ASSERT_EQ( + handler->dispatchToProcessor(endpointResponses, requestBody, &response, responsesComp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + + parsedResponse.Parse(response.c_str()); + ASSERT_TRUE(parsedResponse.IsObject()); + ASSERT_TRUE(parsedResponse.HasMember("object")); + EXPECT_STREQ(parsedResponse["object"].GetString(), "response"); + ASSERT_TRUE(parsedResponse.HasMember("model")); + EXPECT_STREQ(parsedResponse["model"].GetString(), modelName.c_str()); + ASSERT_TRUE(parsedResponse.HasMember("output")); + ASSERT_TRUE(parsedResponse["output"].IsArray()); + ASSERT_GT(parsedResponse["output"].GetArray().Size(), 0); + ASSERT_TRUE(parsedResponse["output"][0].IsObject()); + ASSERT_TRUE(parsedResponse["output"][0].HasMember("type")); + EXPECT_STREQ(parsedResponse["output"][0]["type"].GetString(), "message"); + ASSERT_TRUE(parsedResponse["output"][0].HasMember("content")); + ASSERT_TRUE(parsedResponse["output"][0]["content"].IsArray()); + ASSERT_GT(parsedResponse["output"][0]["content"].GetArray().Size(), 0); + ASSERT_TRUE(parsedResponse["output"][0]["content"][0].HasMember("type")); + EXPECT_STREQ(parsedResponse["output"][0]["content"][0]["type"].GetString(), "output_text"); + + ASSERT_TRUE(parsedResponse.HasMember("usage")); + ASSERT_TRUE(parsedResponse["usage"].IsObject()); + ASSERT_TRUE(parsedResponse["usage"].HasMember("input_tokens")); + ASSERT_TRUE(parsedResponse["usage"].HasMember("output_tokens")); + ASSERT_TRUE(parsedResponse["usage"].HasMember("total_tokens")); +} + +TEST_P(VLMServableExecutionTestParameterized, unaryResponsesOnlyImageInput) { + auto modelName = GetParam(); + std::vector> fields = { + {"max_output_tokens", "5"}, + {"temperature", "0.0"}}; + std::string requestBody = createResponsesRequestBody(modelName, fields, false, 1); + + ovms::HttpRequestComponents responsesComp; + ASSERT_EQ(handler->parseRequestComponents(responsesComp, "POST", endpointResponses, headers), ovms::StatusCode::OK); + + ASSERT_EQ( + handler->dispatchToProcessor(endpointResponses, requestBody, &response, responsesComp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + + parsedResponse.Parse(response.c_str()); + ASSERT_TRUE(parsedResponse.IsObject()); + ASSERT_TRUE(parsedResponse.HasMember("object")); + EXPECT_STREQ(parsedResponse["object"].GetString(), "response"); + ASSERT_TRUE(parsedResponse.HasMember("output")); + ASSERT_TRUE(parsedResponse["output"].IsArray()); + ASSERT_GT(parsedResponse["output"].GetArray().Size(), 0); +} + +TEST_P(VLMServableExecutionTestParameterized, unaryResponsesWithTools) { + auto modelName = GetParam(); + std::vector> fields = { + {"max_output_tokens", "5"}, + {"temperature", "0.0"}, + {"tool_choice", R"("auto")"}, + {"tools", R"([ + { + "type": "function", + "name": "get_weather", + "description": "Get weather by city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string" + } + }, + "required": ["city"] + } + } + ])"}}; + std::string requestBody = createResponsesRequestBody(modelName, fields); + + ovms::HttpRequestComponents responsesComp; + ASSERT_EQ(handler->parseRequestComponents(responsesComp, "POST", endpointResponses, headers), ovms::StatusCode::OK); + + ASSERT_EQ( + 
handler->dispatchToProcessor(endpointResponses, requestBody, &response, responsesComp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + + parsedResponse.Parse(response.c_str()); + ASSERT_TRUE(parsedResponse.IsObject()); + ASSERT_TRUE(parsedResponse.HasMember("object")); + EXPECT_STREQ(parsedResponse["object"].GetString(), "response"); + ASSERT_TRUE(parsedResponse.HasMember("tools")); + ASSERT_TRUE(parsedResponse["tools"].IsArray()); + ASSERT_GT(parsedResponse["tools"].GetArray().Size(), 0); + ASSERT_TRUE(parsedResponse.HasMember("tool_choice")); + ASSERT_TRUE(parsedResponse["tool_choice"].IsString()); + EXPECT_STREQ(parsedResponse["tool_choice"].GetString(), "auto"); +} + +TEST_P(VLMServableExecutionTestParameterized, unaryResponsesWithFunctionToolChoiceObject) { + auto modelName = GetParam(); + std::vector> fields = { + {"max_output_tokens", "5"}, + {"temperature", "0.0"}, + {"tool_choice", R"({"type":"function","name":"get_weather"})"}, + {"tools", R"([ + { + "type": "function", + "name": "get_weather", + "description": "Get weather by city", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string" + } + }, + "required": ["city"] + } + } + ])"}}; + std::string requestBody = createResponsesRequestBody(modelName, fields); + + ovms::HttpRequestComponents responsesComp; + ASSERT_EQ(handler->parseRequestComponents(responsesComp, "POST", endpointResponses, headers), ovms::StatusCode::OK); + + ASSERT_EQ( + handler->dispatchToProcessor(endpointResponses, requestBody, &response, responsesComp, responseComponents, writer, multiPartParser), + ovms::StatusCode::OK); + + parsedResponse.Parse(response.c_str()); + ASSERT_TRUE(parsedResponse.IsObject()); + ASSERT_TRUE(parsedResponse.HasMember("tool_choice")); + ASSERT_TRUE(parsedResponse["tool_choice"].IsObject()); + ASSERT_TRUE(parsedResponse["tool_choice"].HasMember("type")); + EXPECT_STREQ(parsedResponse["tool_choice"]["type"].GetString(), "function"); + ASSERT_TRUE(parsedResponse["tool_choice"].HasMember("name")); + EXPECT_STREQ(parsedResponse["tool_choice"]["name"].GetString(), "get_weather"); +} + // Stream flow TEST_P(VLMServableExecutionTestParameterized, streamBasic) { From b1a6c40e14f0a70cb461726de3b1f8fe90fc4cd2 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Thu, 5 Mar 2026 16:03:05 +0100 Subject: [PATCH 02/24] fix --- src/llm/apis/openai_completions.cpp | 325 ++++++++++-------- src/llm/apis/openai_completions.hpp | 1 + .../continuous_batching/servable.cpp | 15 +- 3 files changed, 190 insertions(+), 151 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index afcd8daf8e..af87a6a971 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -299,125 +299,6 @@ absl::Status normalizeResponsesFunctionToolsInPlace(rapidjson::Document& doc) { return absl::OkStatus(); } -absl::Status normalizeResponsesInputToMessagesInPlace(rapidjson::Document& doc) { - auto inputIt = doc.FindMember("input"); - if (inputIt == doc.MemberEnd()) { - return absl::InvalidArgumentError("input missing in request"); - } - auto& allocator = doc.GetAllocator(); - if (inputIt->value.IsString()) { - rapidjson::Value messages(rapidjson::kArrayType); - rapidjson::Value messageObj(rapidjson::kObjectType); - messageObj.AddMember("role", "user", allocator); - messageObj.AddMember("content", rapidjson::Value(inputIt->value.GetString(), allocator), allocator); - messages.PushBack(messageObj, allocator); - - auto existingMessages = 
doc.FindMember("messages"); - if (existingMessages != doc.MemberEnd()) { - existingMessages->value = messages; - } else { - doc.AddMember("messages", messages, allocator); - } - return absl::OkStatus(); - } - if (!inputIt->value.IsArray()) { - return absl::InvalidArgumentError("input is not a string or array"); - } - - rapidjson::Value messages(rapidjson::kArrayType); - for (auto& item : inputIt->value.GetArray()) { - if (!item.IsObject()) { - return absl::InvalidArgumentError("input array items must be objects"); - } - - auto itemObj = item.GetObject(); - auto roleIt = itemObj.FindMember("role"); - if (roleIt == itemObj.MemberEnd() || !roleIt->value.IsString()) { - return absl::InvalidArgumentError("input item role is missing or invalid"); - } - - rapidjson::Value messageObj(rapidjson::kObjectType); - messageObj.AddMember("role", rapidjson::Value(roleIt->value.GetString(), allocator), allocator); - - auto contentIt = itemObj.FindMember("content"); - if (contentIt == itemObj.MemberEnd()) { - return absl::InvalidArgumentError("input item content is missing"); - } - - if (contentIt->value.IsString()) { - messageObj.AddMember("content", rapidjson::Value(contentIt->value.GetString(), allocator), allocator); - messages.PushBack(messageObj, allocator); - continue; - } - - if (!contentIt->value.IsArray()) { - return absl::InvalidArgumentError("input item content must be a string or array"); - } - - rapidjson::Value normalizedContent(rapidjson::kArrayType); - for (auto& contentItem : contentIt->value.GetArray()) { - if (!contentItem.IsObject()) { - return absl::InvalidArgumentError("input content items must be objects"); - } - auto contentObj = contentItem.GetObject(); - auto typeIt = contentObj.FindMember("type"); - if (typeIt == contentObj.MemberEnd() || !typeIt->value.IsString()) { - return absl::InvalidArgumentError("input content item type is missing or invalid"); - } - - std::string type = typeIt->value.GetString(); - if (type == "input_text") { - auto textIt = contentObj.FindMember("text"); - if (textIt == contentObj.MemberEnd() || !textIt->value.IsString()) { - return absl::InvalidArgumentError("input_text requires a valid text field"); - } - rapidjson::Value textObj(rapidjson::kObjectType); - textObj.AddMember("type", "text", allocator); - textObj.AddMember("text", rapidjson::Value(textIt->value.GetString(), allocator), allocator); - normalizedContent.PushBack(textObj, allocator); - } else if (type == "input_image") { - std::string imageUrl; - auto imageUrlIt = contentObj.FindMember("image_url"); - if (imageUrlIt == contentObj.MemberEnd()) { - return absl::InvalidArgumentError("input_image requires image_url field"); - } - if (imageUrlIt->value.IsString()) { - imageUrl = imageUrlIt->value.GetString(); - } else if (imageUrlIt->value.IsObject()) { - auto imageUrlObj = imageUrlIt->value.GetObject(); - auto urlIt = imageUrlObj.FindMember("url"); - if (urlIt == imageUrlObj.MemberEnd() || !urlIt->value.IsString()) { - return absl::InvalidArgumentError("input_image.image_url.url is missing or invalid"); - } - imageUrl = urlIt->value.GetString(); - } else { - return absl::InvalidArgumentError("input_image.image_url must be a string or object"); - } - - rapidjson::Value imageUrlObj(rapidjson::kObjectType); - imageUrlObj.AddMember("url", rapidjson::Value(imageUrl.c_str(), allocator), allocator); - - rapidjson::Value imageObj(rapidjson::kObjectType); - imageObj.AddMember("type", "image_url", allocator); - imageObj.AddMember("image_url", imageUrlObj, allocator); - normalizedContent.PushBack(imageObj, 
allocator); - } else { - return absl::InvalidArgumentError("Unsupported content type"); - } - } - messageObj.AddMember("content", normalizedContent, allocator); - messages.PushBack(messageObj, allocator); - } - - auto existingMessages = doc.FindMember("messages"); - if (existingMessages != doc.MemberEnd()) { - existingMessages->value = messages; - } else { - doc.AddMember("messages", messages, allocator); - } - return absl::OkStatus(); -} - } // namespace absl::Status OpenAIChatCompletionsHandler::parseCompletionsPart() { @@ -570,6 +451,193 @@ absl::Status OpenAIChatCompletionsHandler::ensureArgumentsInToolCalls(Value& mes return absl::OkStatus(); } +absl::Status OpenAIChatCompletionsHandler::parseResponsesInputDirectly(std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains) { + auto inputIt = doc.FindMember("input"); + if (inputIt == doc.MemberEnd()) { + return absl::InvalidArgumentError("input missing in request"); + } + + auto& allocator = doc.GetAllocator(); + rapidjson::Value messages(rapidjson::kArrayType); + + if (inputIt->value.IsString()) { + request.prompt = inputIt->value.GetString(); + if (!request.prompt.has_value() || request.prompt.value().empty()) { + return absl::InvalidArgumentError("input cannot be empty"); + } + + request.chatHistory.push_back({}); + request.chatHistory.last()["role"] = "user"; + request.chatHistory.last()["content"] = request.prompt.value(); + + rapidjson::Value messageObj(rapidjson::kObjectType); + messageObj.AddMember("role", "user", allocator); + messageObj.AddMember("content", rapidjson::Value(request.prompt->c_str(), allocator), allocator); + messages.PushBack(messageObj, allocator); + } else if (inputIt->value.IsArray()) { + if (inputIt->value.GetArray().Size() == 0) { + return absl::InvalidArgumentError("Messages array cannot be empty"); + } + + for (size_t i = 0; i < inputIt->value.GetArray().Size(); ++i) { + auto& item = inputIt->value.GetArray()[i]; + if (!item.IsObject()) { + return absl::InvalidArgumentError("input array items must be objects"); + } + + auto itemObj = item.GetObject(); + auto roleIt = itemObj.FindMember("role"); + if (roleIt == itemObj.MemberEnd() || !roleIt->value.IsString()) { + return absl::InvalidArgumentError("input item role is missing or invalid"); + } + + request.chatHistory.push_back({}); + request.chatHistory.last()["role"] = roleIt->value.GetString(); + + rapidjson::Value messageObj(rapidjson::kObjectType); + messageObj.AddMember("role", rapidjson::Value(roleIt->value.GetString(), allocator), allocator); + + auto contentIt = itemObj.FindMember("content"); + if (contentIt == itemObj.MemberEnd()) { + return absl::InvalidArgumentError("input item content is missing"); + } + + if (contentIt->value.IsString()) { + messageObj.AddMember("content", rapidjson::Value(contentIt->value.GetString(), allocator), allocator); + request.chatHistory.last()["content"] = contentIt->value.GetString(); + messages.PushBack(messageObj, allocator); + continue; + } + + if (!contentIt->value.IsArray()) { + return absl::InvalidArgumentError("input item content must be a string or array"); + } + if (contentIt->value.GetArray().Size() == 0) { + return absl::InvalidArgumentError("Invalid message structure - content array is empty"); + } + + std::string contentText; + for (auto& contentItem : contentIt->value.GetArray()) { + if (!contentItem.IsObject()) { + return absl::InvalidArgumentError("input content items must be objects"); + } + auto contentObj = contentItem.GetObject(); + auto typeIt = contentObj.FindMember("type"); + 
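+ // Item shapes handled by the checks below; a sketch based on this loop, values illustrative only:
+ //   {"type": "input_text", "text": "describe this image"}
+ //   {"type": "input_image", "image_url": "data:image/png;base64,..."}
+ //   {"type": "input_image", "image_url": {"url": "https://example.com/cat.png"}}  // hypothetical URL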
+ if (typeIt == contentObj.MemberEnd() || !typeIt->value.IsString()) {
+ return absl::InvalidArgumentError("input content item type is missing or invalid");
+ }
+
+ const std::string type = typeIt->value.GetString();
+ if (type == "input_text") {
+ auto textIt = contentObj.FindMember("text");
+ if (textIt == contentObj.MemberEnd() || !textIt->value.IsString()) {
+ return absl::InvalidArgumentError("input_text requires a valid text field");
+ }
+ contentText = textIt->value.GetString();
+ } else if (type == "input_image") {
+ std::string imageUrl;
+ auto imageUrlIt = contentObj.FindMember("image_url");
+ if (imageUrlIt == contentObj.MemberEnd()) {
+ return absl::InvalidArgumentError("input_image requires image_url field");
+ }
+ if (imageUrlIt->value.IsString()) {
+ imageUrl = imageUrlIt->value.GetString();
+ } else if (imageUrlIt->value.IsObject()) {
+ auto imageUrlObj = imageUrlIt->value.GetObject();
+ auto urlIt = imageUrlObj.FindMember("url");
+ if (urlIt == imageUrlObj.MemberEnd() || !urlIt->value.IsString()) {
+ return absl::InvalidArgumentError("input_image.image_url.url is missing or invalid");
+ }
+ imageUrl = urlIt->value.GetString();
+ } else {
+ return absl::InvalidArgumentError("input_image.image_url must be a string or object");
+ }
+
+ std::string pattern = "base64,";
+ std::size_t pos = imageUrl.find(pattern);
+ std::string decoded;
+ ov::Tensor tensor;
+ if (pos != std::string::npos) {
+ SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Loading image from base64 string");
+ size_t offset = pos + pattern.length();
+ if (!absl::Base64Unescape(std::string_view(imageUrl.data() + offset, imageUrl.size() - offset), &decoded)) {
+ return absl::InvalidArgumentError("Invalid base64 string in request");
+ }
+ try {
+ tensor = loadImageStbiFromMemory(decoded);
+ } catch (std::runtime_error& e) {
+ std::stringstream ss;
+ ss << "Image parsing failed: " << e.what();
+ SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str());
+ return absl::InvalidArgumentError(ss.str());
+ }
+ } else if (std::regex_match(imageUrl.c_str(), std::regex("^(http|https|ftp|sftp|)://(.*)"))) {
+ SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Loading image using curl");
+ int64_t sizeLimit = 20000000;  // restrict single image size to 20MB
+ if (!allowedMediaDomains.has_value() || !isDomainAllowed(allowedMediaDomains.value(), imageUrl.c_str())) {
+ return absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains");
+ }
+ auto status = downloadImage(imageUrl.c_str(), decoded, sizeLimit);
+ if (status != absl::OkStatus()) {
+ return status;
+ }
+ try {
+ tensor = loadImageStbiFromMemory(decoded);
+ } catch (std::runtime_error& e) {
+ std::stringstream ss;
+ ss << "Image parsing failed: " << e.what();
+ SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str());
+ return absl::InvalidArgumentError("Image parsing failed");
+ }
+ } else {
+ if (!allowedLocalMediaPath.has_value()) {
+ return absl::InvalidArgumentError("Loading images from local filesystem is disabled.");
+ }
+ if (FileSystem::isPathEscaped(imageUrl)) {
+ std::stringstream ss;
+ ss << "Path " << imageUrl.c_str() << " escape with .. is forbidden.";
+ SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str());
+ return absl::InvalidArgumentError(ss.str());
+ }
+ SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Loading image from local filesystem");
+ const auto firstMissmatch = std::mismatch(imageUrl.begin(), imageUrl.end(), allowedLocalMediaPath.value().begin(), allowedLocalMediaPath.value().end());
+ if (firstMissmatch.second != allowedLocalMediaPath.value().end()) {
+ return absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path");
+ }
+ try {
+ tensor = loadImageStbiFromFile(imageUrl.c_str());
+ } catch (std::runtime_error& e) {
+ std::stringstream ss;
+ ss << "Image file " << imageUrl.c_str() << " parsing failed: " << e.what();
+ SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str());
+ return absl::InvalidArgumentError(ss.str());
+ }
+ }
+ request.imageHistory.push_back({i, tensor});
+ } else {
+ return absl::InvalidArgumentError("Unsupported content type");
+ }
+ }
+
+ messageObj.AddMember("content", rapidjson::Value(contentText.c_str(), allocator), allocator);
+ request.chatHistory.last()["content"] = contentText;
+ messages.PushBack(messageObj, allocator);
+ }
+ } else {
+ return absl::InvalidArgumentError("input is not a string or array");
+ }
+
+ auto existingMessages = doc.FindMember("messages");
+ if (existingMessages != doc.MemberEnd()) {
+ existingMessages->value = messages;
+ } else {
+ doc.AddMember("messages", messages, allocator);
+ }
+
+ SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Parsed responses input directly to chat history");
+ return absl::OkStatus();
+}
+
 absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains) {
 auto it = doc.FindMember("messages");
 if (it == doc.MemberEnd())
@@ -1000,24 +1068,7 @@ absl::Status OpenAIChatCompletionsHandler::parseResponsesPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains) {
- auto it = doc.FindMember("input");
- if (it == doc.MemberEnd()) {
- return absl::InvalidArgumentError("input missing in request");
- }
- if (it->value.IsString()) {
- request.prompt = it->value.GetString();
- if (!request.prompt.has_value() || !request.prompt.value().size()) {
- return absl::InvalidArgumentError("input cannot be empty");
- }
- }
-
- auto messagesStatus = parseMessages(allowedLocalMediaPath, allowedMediaDomains);
+ auto messagesStatus = parseResponsesInputDirectly(allowedLocalMediaPath, allowedMediaDomains);
 if (!messagesStatus.ok()) {
 return messagesStatus;
 }
diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp
index f4f961e18d..69da9b76d2 100644
--- a/src/llm/apis/openai_completions.hpp
+++ b/src/llm/apis/openai_completions.hpp
@@ -81,6 +81,7 @@ class OpenAIChatCompletionsHandler {
 absl::Status parseCompletionsPart();
 absl::Status parseChatCompletionsPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
 absl::Status parseResponsesPart(std::optional<uint32_t> maxTokensLimit, std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
+ absl::Status parseResponsesInputDirectly(std::optional<std::string> allowedLocalMediaPath, std::optional<std::vector<std::string>> allowedMediaDomains);
 absl::Status parseCommonPart(std::optional<uint32_t> maxTokensLimit, uint32_t bestOfLimit, std::optional<uint32_t> maxModelLength);
 ParsedOutput parseOutputIfNeeded(const std::vector<int64_t>& generatedIds);
prepareEmptyStopReasonOutput() { return out; } -static ov::genai::GenerationOutput prepareEmptyNoneReasonOutput() { - static ov::genai::GenerationOutput out = { - std::vector(), // generated_ids - std::vector(), // generated_log_probs - 0.0f, // score - ov::genai::GenerationFinishReason::NONE}; - return out; -} - absl::Status ContinuousBatchingServable::readCompleteExecutionResults(std::shared_ptr& executionContext) { auto cbExecutionContext = std::static_pointer_cast(executionContext); if (cbExecutionContext->payload.client->isDisconnected()) { @@ -145,11 +136,7 @@ absl::Status ContinuousBatchingServable::readPartialExecutionResults(std::shared ov::genai::GenerationOutputs generationOutputs = cbExecutionContext->generationHandle->read(); RET_CHECK(generationOutputs.size() <= 1); // TODO: Support multiple generations if (generationOutputs.size() == 0) { - if (cbExecutionContext->generationHandle->get_status() == ov::genai::GenerationStatus::RUNNING) { - cbExecutionContext->generationOutputs = {prepareEmptyNoneReasonOutput()}; - } else { - cbExecutionContext->generationOutputs = {prepareEmptyStopReasonOutput()}; - } + cbExecutionContext->generationOutputs = {prepareEmptyStopReasonOutput()}; } else { cbExecutionContext->generationOutputs = {generationOutputs.begin()->second}; } From b5e9707c8a62b16f5c56a9f25fcfbbe8ab723550 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Thu, 5 Mar 2026 16:04:40 +0100 Subject: [PATCH 03/24] style --- src/test/http_openai_handler_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 1ac3f208c7..1ea266a221 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -269,7 +269,7 @@ TEST_F(HttpOpenAIHandlerTest, Stream) { ASSERT_EQ(response, ""); } - TEST_F(HttpOpenAIHandlerTest, ResponsesStream) { +TEST_F(HttpOpenAIHandlerTest, ResponsesStream) { std::string requestBody = R"( { "model": "gpt", @@ -284,11 +284,11 @@ TEST_F(HttpOpenAIHandlerTest, Stream) { EXPECT_CALL(*writer, IsDisconnected()).Times(9); ASSERT_EQ( - handler->dispatchToProcessor("/v3/responses", requestBody, &response, comp, responseComponents, writer, multiPartParser), - ovms::StatusCode::PARTIAL_END); + handler->dispatchToProcessor("/v3/responses", requestBody, &response, comp, responseComponents, writer, multiPartParser), + ovms::StatusCode::PARTIAL_END); ASSERT_EQ(response, ""); - } +} TEST_F(HttpOpenAIHandlerTest, BodyNotAJson) { std::string requestBody = "not a json"; From 7d9bf2fce9a5401b28ac2ca274b60f4e7a8a5ebe Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Thu, 5 Mar 2026 16:25:36 +0100 Subject: [PATCH 04/24] fix --- src/test/http_openai_handler_test.cpp | 269 ++++++++++++++++++++------ 1 file changed, 215 insertions(+), 54 deletions(-) diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 1ea266a221..9eca657f85 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -14,6 +14,7 @@ // limitations under the License. 
//***************************************************************************** #include +#include #include #include #include @@ -314,60 +315,6 @@ TEST_F(HttpOpenAIHandlerTest, JsonBodyValidButNotAnObject) { ASSERT_EQ(status.string(), "The file is not valid json - JSON body must be an object"); } -TEST_F(HttpOpenAIHandlerTest, ModelFieldMissing) { - std::string requestBody = R"( - { - "stream": true, - "messages": [] - } - )"; - - EXPECT_CALL(*writer, PartialReplyEnd()).Times(0); - EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0); - EXPECT_CALL(*writer, IsDisconnected()).Times(0); - - auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser); - ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID); - ASSERT_EQ(status.string(), "The file is not valid json - model field is missing in JSON body"); -} - -TEST_F(HttpOpenAIHandlerTest, ModelFieldNotAString) { - std::string requestBody = R"( - { - "model": 2, - "stream": true, - "messages": [] - } - )"; - - EXPECT_CALL(*writer, PartialReplyEnd()).Times(0); - EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0); - EXPECT_CALL(*writer, IsDisconnected()).Times(0); - - auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser); - ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID); - ASSERT_EQ(status.string(), "The file is not valid json - model field is not a string"); -} - -TEST_F(HttpOpenAIHandlerTest, StreamFieldNotABoolean) { - std::string requestBody = R"( - { - "model": "gpt", - "stream": 2, - "messages": [] - } - )"; - - EXPECT_CALL(*writer, PartialReplyBegin(::testing::_)).Times(0); - EXPECT_CALL(*writer, PartialReplyEnd()).Times(0); - EXPECT_CALL(*writer, PartialReply(::testing::_)).Times(0); - EXPECT_CALL(*writer, IsDisconnected()).Times(0); - - auto status = handler->dispatchToProcessor("/v3/completions", requestBody, &response, comp, responseComponents, writer, multiPartParser); - ASSERT_EQ(status, ovms::StatusCode::JSON_INVALID); - ASSERT_EQ(status.string(), "The file is not valid json - stream field is not a boolean"); -} - TEST_F(HttpOpenAIHandlerTest, GraphWithANameDoesNotExist) { std::string requestBody = R"( { @@ -423,6 +370,220 @@ class HttpOpenAIHandlerParsingTest : public ::testing::Test { } }; +class HttpOpenAIHandlerCommonParsingValidationTest : public HttpOpenAIHandlerParsingTest, + public ::testing::WithParamInterface { +protected: + ovms::Endpoint endpoint() const { + return GetParam(); + } + + std::string createRequestWithRawStreamValue(const std::string& streamRawValue) const { + if (endpoint() == ovms::Endpoint::COMPLETIONS) { + return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"prompt\":\"valid prompt\"}"; + } + if (endpoint() == ovms::Endpoint::RESPONSES) { + return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"input\":\"valid prompt\"}"; + } + return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}"; + } + + std::string createRequestWithoutModel() const { + if (endpoint() == ovms::Endpoint::COMPLETIONS) { + return "{\"prompt\":\"valid prompt\"}"; + } + if (endpoint() == ovms::Endpoint::RESPONSES) { + return "{\"input\":\"valid prompt\"}"; + } + return "{\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}"; + } + + std::string createRequestWithNonStringModel() const { + if (endpoint() == 
ovms::Endpoint::COMPLETIONS) {
+ return "{\"model\":2,\"prompt\":\"valid prompt\"}";
+ }
+ if (endpoint() == ovms::Endpoint::RESPONSES) {
+ return "{\"model\":2,\"input\":\"valid prompt\"}";
+ }
+ return "{\"model\":2,\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}";
+ }
+};
+
+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, StreamFieldNotABooleanFails) {
+ std::string json = createRequestWithRawStreamValue("2");
+ doc.Parse(json.c_str());
+ ASSERT_FALSE(doc.HasParseError());
+
+ std::optional maxTokensLimit;
+ uint32_t bestOfLimit = 0;
+ std::optional maxModelLength;
+ std::shared_ptr apiHandler =
+ std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer);
+
+ EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("Stream is not bool"));
+}
+
+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, ModelFieldMissingFails) {
+ std::string json = createRequestWithoutModel();
+ doc.Parse(json.c_str());
+ ASSERT_FALSE(doc.HasParseError());
+
+ std::optional maxTokensLimit;
+ uint32_t bestOfLimit = 0;
+ std::optional maxModelLength;
+ std::shared_ptr apiHandler =
+ std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer);
+
+ EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("model missing in request"));
+}
+
+TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, ModelFieldNotStringFails) {
+ std::string json = createRequestWithNonStringModel();
+ doc.Parse(json.c_str());
+ ASSERT_FALSE(doc.HasParseError());
+
+ std::optional maxTokensLimit;
+ uint32_t bestOfLimit = 0;
+ std::optional maxModelLength;
+ std::shared_ptr apiHandler =
+ std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer);
+
+ EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("model is not a string"));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ CommonParsingValidation,
+ HttpOpenAIHandlerCommonParsingValidationTest,
+ ::testing::Values(ovms::Endpoint::CHAT_COMPLETIONS, ovms::Endpoint::COMPLETIONS, ovms::Endpoint::RESPONSES),
+ [](const testing::TestParamInfo& info) {
+ switch (info.param) {
+ case ovms::Endpoint::CHAT_COMPLETIONS:
+ return "ChatCompletions";
+ case ovms::Endpoint::COMPLETIONS:
+ return "Completions";
+ case ovms::Endpoint::RESPONSES:
+ return "Responses";
+ default:
+ return "Unknown";
+ }
+ });
+
+ class HttpOpenAIHandlerChatAndResponsesParsingTest : public HttpOpenAIHandlerParsingTest,
+ public ::testing::WithParamInterface {
+ protected:
+ ovms::Endpoint endpoint() const {
+ return GetParam();
+ }
+
+ std::string createTextRequest(const std::string& text, const std::string& extraJsonFields = "") const {
+ if (endpoint() == ovms::Endpoint::RESPONSES) {
+ return std::string("{\"model\":\"llama\",\"input\":\"") + text + "\"" + extraJsonFields + "}";
+ }
+ return std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"") + text + "\"}]" + extraJsonFields + "}";
+ }
+
+ std::string createMultimodalRequestWithImageUrl(const std::string& dataUrl) const {
+ if (endpoint() == ovms::Endpoint::RESPONSES) {
+ return std::string("{\"model\":\"llama\",\"input\":[{\"role\":\"user\",\"content\":[{\"type\":\"input_text\",\"text\":\"what is in this image?\"},{\"type\":\"input_image\",\"image_url\":\"") + dataUrl + "\"}]}] }";
+ }
+ return std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"what is in this image?\"},{\"type\":\"image_url\",\"image_url\":{\"url\":\"") + dataUrl + "\"}}]}]}";
+ }
+
+ std::string createToolRequest(const std::string& toolChoiceJson) const {
+ std::string base = createTextRequest("What is the weather like in Boston today?", ",\"tools\":[{\"type\":\"function\",\"function\":{\"name\":\"get_current_weather\",\"parameters\":{\"type\":\"object\",\"properties\":{\"location\":{\"type\":\"string\"}},\"required\":[\"location\"]}}}]");
+ if (toolChoiceJson.empty()) {
+ return base;
+ }
+ base.pop_back();  // remove trailing '}'
+ base += ",\"tool_choice\":" + toolChoiceJson + "}";
+ return base;
+ }
+
+ std::shared_ptr parseCurrentRequest(const std::string& json) {
+ doc.Parse(json.c_str());
+ EXPECT_FALSE(doc.HasParseError()) << json;
+ std::optional maxTokensLimit;
+ uint32_t bestOfLimit = 0;
+ std::optional maxModelLength;
+ std::shared_ptr apiHandler =
+ std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer);
+ EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()) << json;
+ return apiHandler;
+ }
+ };
+
+ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTextInputCreatesUserChatMessage) {
+ std::string json = createTextRequest("What is OpenVINO?");
+ auto apiHandler = parseCurrentRequest(json);
+
+ auto& chatHistory = apiHandler->getChatHistory();
+ ASSERT_EQ(chatHistory.size(), 1);
+ ASSERT_TRUE(chatHistory[0].contains("role"));
+ ASSERT_TRUE(chatHistory[0].contains("content"));
+ EXPECT_EQ(chatHistory[0]["role"], "user");
+ EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?");
+ if (endpoint() == ovms::Endpoint::RESPONSES) {
+ EXPECT_NE(apiHandler->getProcessedJson().find("\"messages\""), std::string::npos);
+ } else {
+ EXPECT_TRUE(apiHandler->getProcessedJson().empty());
+ }
+ }
+
"max_output_tokens" : "max_completion_tokens"; + std::string json = createTextRequest("valid prompt", ",\"" + tokenField + "\":7"); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_TRUE(apiHandler->getMaxTokens().has_value()); + EXPECT_EQ(apiHandler->getMaxTokens().value(), 7); + } + + TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingFunctionToolsWithAutoChoiceSucceeds) { + std::string json = createToolRequest("\"auto\""); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_TRUE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "auto"); + } + + TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingToolChoiceFunctionObjectSucceeds) { + std::string json = createToolRequest("{\"type\":\"function\",\"function\":{\"name\":\"get_current_weather\"}}"); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_TRUE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "get_current_weather"); + } + + TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingToolChoiceNoneRemovesTools) { + std::string json = createToolRequest("\"none\""); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_FALSE(apiHandler->areToolsAvailable()); + EXPECT_EQ(apiHandler->getToolChoice(), "none"); + } + + TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingMultimodalInputImageSucceeds) { + const std::string base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="; + std::string json = createMultimodalRequestWithImageUrl(base64Image); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_EQ(apiHandler->getImageHistory().size(), 1); + } + + INSTANTIATE_TEST_SUITE_P( + ChatAndResponses, + HttpOpenAIHandlerChatAndResponsesParsingTest, + ::testing::Values(ovms::Endpoint::CHAT_COMPLETIONS, ovms::Endpoint::RESPONSES), + [](const testing::TestParamInfo& info) { + switch (info.param) { + case ovms::Endpoint::CHAT_COMPLETIONS: + return "ChatCompletions"; + case ovms::Endpoint::RESPONSES: + return "Responses"; + default: + return "Unknown"; + } + }); + static std::vector createHermes3ToolCallTokens(ov::genai::Tokenizer& tokenizer) { std::string toolCall = R"({"name": "example_tool", "arguments": {"arg1": "value1", "arg2": 42}})"; auto generatedTensor = tokenizer.encode(toolCall, ov::genai::add_special_tokens(true)).input_ids; From 85ddf6c1a7bfed358e82ac47a79297c5e95485e2 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Thu, 5 Mar 2026 16:27:12 +0100 Subject: [PATCH 05/24] style --- src/test/http_openai_handler_test.cpp | 242 +++++++++++++------------- 1 file changed, 121 insertions(+), 121 deletions(-) diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 9eca657f85..9f0e3c0441 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -371,147 +371,147 @@ class HttpOpenAIHandlerParsingTest : public ::testing::Test { }; class HttpOpenAIHandlerCommonParsingValidationTest : public HttpOpenAIHandlerParsingTest, - public ::testing::WithParamInterface { + public ::testing::WithParamInterface { protected: - ovms::Endpoint endpoint() const { - return GetParam(); - } - - std::string createRequestWithRawStreamValue(const std::string& streamRawValue) const { - if (endpoint() == ovms::Endpoint::COMPLETIONS) { - return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"prompt\":\"valid prompt\"}"; - } - if (endpoint() == ovms::Endpoint::RESPONSES) { - return 
std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"input\":\"valid prompt\"}"; + ovms::Endpoint endpoint() const { + return GetParam(); } - return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}"; - } - std::string createRequestWithoutModel() const { - if (endpoint() == ovms::Endpoint::COMPLETIONS) { - return "{\"prompt\":\"valid prompt\"}"; - } - if (endpoint() == ovms::Endpoint::RESPONSES) { - return "{\"input\":\"valid prompt\"}"; + std::string createRequestWithRawStreamValue(const std::string& streamRawValue) const { + if (endpoint() == ovms::Endpoint::COMPLETIONS) { + return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"prompt\":\"valid prompt\"}"; + } + if (endpoint() == ovms::Endpoint::RESPONSES) { + return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"input\":\"valid prompt\"}"; + } + return std::string("{\"model\":\"llama\",\"stream\":") + streamRawValue + ",\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}"; } - return "{\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}"; - } - std::string createRequestWithNonStringModel() const { - if (endpoint() == ovms::Endpoint::COMPLETIONS) { - return "{\"model\":2,\"prompt\":\"valid prompt\"}"; + std::string createRequestWithoutModel() const { + if (endpoint() == ovms::Endpoint::COMPLETIONS) { + return "{\"prompt\":\"valid prompt\"}"; + } + if (endpoint() == ovms::Endpoint::RESPONSES) { + return "{\"input\":\"valid prompt\"}"; + } + return "{\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}"; } - if (endpoint() == ovms::Endpoint::RESPONSES) { - return "{\"model\":2,\"input\":\"valid prompt\"}"; + + std::string createRequestWithNonStringModel() const { + if (endpoint() == ovms::Endpoint::COMPLETIONS) { + return "{\"model\":2,\"prompt\":\"valid prompt\"}"; + } + if (endpoint() == ovms::Endpoint::RESPONSES) { + return "{\"model\":2,\"input\":\"valid prompt\"}"; + } + return "{\"model\":2,\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}"; } - return "{\"model\":2,\"messages\":[{\"role\":\"user\",\"content\":\"valid prompt\"}]}"; - } }; TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, StreamFieldNotABooleanFails) { - std::string json = createRequestWithRawStreamValue("2"); - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); + std::string json = createRequestWithRawStreamValue("2"); + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); - std::optional maxTokensLimit; - uint32_t bestOfLimit = 0; - std::optional maxModelLength; - std::shared_ptr apiHandler = - std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("Stream is not bool")); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("Stream is not bool")); } TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, ModelFieldMissingFails) { - std::string json = createRequestWithoutModel(); - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); + std::string json = createRequestWithoutModel(); + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); - 
std::optional maxTokensLimit; - uint32_t bestOfLimit = 0; - std::optional maxModelLength; - std::shared_ptr apiHandler = - std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("model missing in request")); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("model missing in request")); } TEST_P(HttpOpenAIHandlerCommonParsingValidationTest, ModelFieldNotStringFails) { - std::string json = createRequestWithNonStringModel(); - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); + std::string json = createRequestWithNonStringModel(); + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); - std::optional maxTokensLimit; - uint32_t bestOfLimit = 0; - std::optional maxModelLength; - std::shared_ptr apiHandler = - std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("model is not a string")); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("model is not a string")); } INSTANTIATE_TEST_SUITE_P( - CommonParsingValidation, - HttpOpenAIHandlerCommonParsingValidationTest, - ::testing::Values(ovms::Endpoint::CHAT_COMPLETIONS, ovms::Endpoint::COMPLETIONS, ovms::Endpoint::RESPONSES), - [](const testing::TestParamInfo& info) { - switch (info.param) { - case ovms::Endpoint::CHAT_COMPLETIONS: - return "ChatCompletions"; - case ovms::Endpoint::COMPLETIONS: - return "Completions"; - case ovms::Endpoint::RESPONSES: - return "Responses"; - default: - return "Unknown"; - } - }); + CommonParsingValidation, + HttpOpenAIHandlerCommonParsingValidationTest, + ::testing::Values(ovms::Endpoint::CHAT_COMPLETIONS, ovms::Endpoint::COMPLETIONS, ovms::Endpoint::RESPONSES), + [](const testing::TestParamInfo& info) { + switch (info.param) { + case ovms::Endpoint::CHAT_COMPLETIONS: + return "ChatCompletions"; + case ovms::Endpoint::COMPLETIONS: + return "Completions"; + case ovms::Endpoint::RESPONSES: + return "Responses"; + default: + return "Unknown"; + } + }); - class HttpOpenAIHandlerChatAndResponsesParsingTest : public HttpOpenAIHandlerParsingTest, - public ::testing::WithParamInterface { - protected: +class HttpOpenAIHandlerChatAndResponsesParsingTest : public HttpOpenAIHandlerParsingTest, + public ::testing::WithParamInterface { +protected: ovms::Endpoint endpoint() const { - return GetParam(); + return GetParam(); } std::string createTextRequest(const std::string& text, const std::string& extraJsonFields = "") const { - if (endpoint() == ovms::Endpoint::RESPONSES) { - return std::string("{\"model\":\"llama\",\"input\":\"") + text + "\"" + extraJsonFields + "}"; - } - return std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"") + text + "\"}]" + extraJsonFields + "}"; + if (endpoint() == ovms::Endpoint::RESPONSES) { + return std::string("{\"model\":\"llama\",\"input\":\"") + text 
+ "\"" + extraJsonFields + "}"; + } + return std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"") + text + "\"}]" + extraJsonFields + "}"; } std::string createMultimodalRequestWithImageUrl(const std::string& dataUrl) const { - if (endpoint() == ovms::Endpoint::RESPONSES) { - return std::string("{\"model\":\"llama\",\"input\":[{\"role\":\"user\",\"content\":[{\"type\":\"input_text\",\"text\":\"what is in this image?\"},{\"type\":\"input_image\",\"image_url\":\"") + dataUrl + "\"}]}] }"; - } - return std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"what is in this image?\"},{\"type\":\"image_url\",\"image_url\":{\"url\":\"") + dataUrl + "\"}}]}]}"; + if (endpoint() == ovms::Endpoint::RESPONSES) { + return std::string("{\"model\":\"llama\",\"input\":[{\"role\":\"user\",\"content\":[{\"type\":\"input_text\",\"text\":\"what is in this image?\"},{\"type\":\"input_image\",\"image_url\":\"") + dataUrl + "\"}]}] }"; + } + return std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"text\",\"text\":\"what is in this image?\"},{\"type\":\"image_url\",\"image_url\":{\"url\":\"") + dataUrl + "\"}}]}]}"; } std::string createToolRequest(const std::string& toolChoiceJson) const { - std::string base = createTextRequest("What is the weather like in Boston today?", ",\"tools\":[{\"type\":\"function\",\"function\":{\"name\":\"get_current_weather\",\"parameters\":{\"type\":\"object\",\"properties\":{\"location\":{\"type\":\"string\"}},\"required\":[\"location\"]}}}]"); - if (toolChoiceJson.empty()) { + std::string base = createTextRequest("What is the weather like in Boston today?", ",\"tools\":[{\"type\":\"function\",\"function\":{\"name\":\"get_current_weather\",\"parameters\":{\"type\":\"object\",\"properties\":{\"location\":{\"type\":\"string\"}},\"required\":[\"location\"]}}}]"); + if (toolChoiceJson.empty()) { + return base; + } + base.pop_back(); // remove trailing '}' + base += ",\"tool_choice\":" + toolChoiceJson + "}"; return base; - } - base.pop_back(); // remove trailing '}' - base += ",\"tool_choice\":" + toolChoiceJson + "}"; - return base; } std::shared_ptr parseCurrentRequest(const std::string& json) { - doc.Parse(json.c_str()); - EXPECT_FALSE(doc.HasParseError()) << json; - std::optional maxTokensLimit; - uint32_t bestOfLimit = 0; - std::optional maxModelLength; - std::shared_ptr apiHandler = - std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()) << json; - return apiHandler; + doc.Parse(json.c_str()); + EXPECT_FALSE(doc.HasParseError()) << json; + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = + std::make_shared(doc, endpoint(), std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()) << json; + return apiHandler; } - }; +}; - TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTextInputCreatesUserChatMessage) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTextInputCreatesUserChatMessage) { std::string json = createTextRequest("What is OpenVINO?"); auto apiHandler = parseCurrentRequest(json); @@ -522,66 +522,66 @@ INSTANTIATE_TEST_SUITE_P( EXPECT_EQ(chatHistory[0]["role"], "user"); EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); if 
(endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_NE(apiHandler->getProcessedJson().find("\"messages\""), std::string::npos); + EXPECT_NE(apiHandler->getProcessedJson().find("\"messages\""), std::string::npos); } else { - EXPECT_TRUE(apiHandler->getProcessedJson().empty()); + EXPECT_TRUE(apiHandler->getProcessedJson().empty()); } - } +} - TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTokenLimitSetsMaxTokens) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTokenLimitSetsMaxTokens) { std::string tokenField = endpoint() == ovms::Endpoint::RESPONSES ? "max_output_tokens" : "max_completion_tokens"; std::string json = createTextRequest("valid prompt", ",\"" + tokenField + "\":7"); auto apiHandler = parseCurrentRequest(json); EXPECT_TRUE(apiHandler->getMaxTokens().has_value()); EXPECT_EQ(apiHandler->getMaxTokens().value(), 7); - } +} - TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingFunctionToolsWithAutoChoiceSucceeds) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingFunctionToolsWithAutoChoiceSucceeds) { std::string json = createToolRequest("\"auto\""); auto apiHandler = parseCurrentRequest(json); EXPECT_TRUE(apiHandler->areToolsAvailable()); EXPECT_EQ(apiHandler->getToolChoice(), "auto"); - } +} - TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingToolChoiceFunctionObjectSucceeds) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingToolChoiceFunctionObjectSucceeds) { std::string json = createToolRequest("{\"type\":\"function\",\"function\":{\"name\":\"get_current_weather\"}}"); auto apiHandler = parseCurrentRequest(json); EXPECT_TRUE(apiHandler->areToolsAvailable()); EXPECT_EQ(apiHandler->getToolChoice(), "get_current_weather"); - } +} - TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingToolChoiceNoneRemovesTools) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingToolChoiceNoneRemovesTools) { std::string json = createToolRequest("\"none\""); auto apiHandler = parseCurrentRequest(json); EXPECT_FALSE(apiHandler->areToolsAvailable()); EXPECT_EQ(apiHandler->getToolChoice(), "none"); - } +} - TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingMultimodalInputImageSucceeds) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingMultimodalInputImageSucceeds) { const std::string base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="; std::string json = createMultimodalRequestWithImageUrl(base64Image); auto apiHandler = parseCurrentRequest(json); EXPECT_EQ(apiHandler->getImageHistory().size(), 1); - } +} - INSTANTIATE_TEST_SUITE_P( +INSTANTIATE_TEST_SUITE_P( ChatAndResponses, HttpOpenAIHandlerChatAndResponsesParsingTest, ::testing::Values(ovms::Endpoint::CHAT_COMPLETIONS, ovms::Endpoint::RESPONSES), [](const testing::TestParamInfo& info) { - switch (info.param) { - case ovms::Endpoint::CHAT_COMPLETIONS: - return "ChatCompletions"; - case ovms::Endpoint::RESPONSES: - return "Responses"; - default: - return "Unknown"; - } + switch (info.param) { + case ovms::Endpoint::CHAT_COMPLETIONS: + return "ChatCompletions"; + case ovms::Endpoint::RESPONSES: + return "Responses"; + default: + return "Unknown"; + } }); static std::vector createHermes3ToolCallTokens(ov::genai::Tokenizer& tokenizer) { From b751e6157f141c872e12e12c091bdb52a5a49f20 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 6 Mar 2026 10:42:11 +0100 Subject: [PATCH 06/24] remove redundant tests --- src/test/http_openai_handler_test.cpp | 65 
--------------------------- 1 file changed, 65 deletions(-) diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 9f0e3c0441..474cde8f28 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -1631,47 +1631,6 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingRequestWithNullParametersCompletions } } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxOutputTokensSetsMaxTokens) { - std::string json = R"({ - "model": "llama", - "input": "valid prompt", - "max_output_tokens": 7 - })"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::optional maxTokensLimit; - uint32_t bestOfLimit = 0; - std::optional maxModelLength; - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); - EXPECT_TRUE(apiHandler->getMaxTokens().has_value()); - EXPECT_EQ(apiHandler->getMaxTokens().value(), 7); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesStringInputCreatesUserChatMessage) { - std::string json = R"({ - "model": "llama", - "input": "What is OpenVINO?" - })"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::optional maxTokensLimit; - uint32_t bestOfLimit = 0; - std::optional maxModelLength; - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); - - auto& chatHistory = apiHandler->getChatHistory(); - ASSERT_EQ(chatHistory.size(), 1); - ASSERT_TRUE(chatHistory[0].contains("role")); - ASSERT_TRUE(chatHistory[0].contains("content")); - EXPECT_EQ(chatHistory[0]["role"], "user"); - EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); - EXPECT_NE(apiHandler->getProcessedJson().find("\"messages\""), std::string::npos); - EXPECT_NE(apiHandler->getProcessedJson().find("\"role\":\"user\""), std::string::npos); - EXPECT_NE(apiHandler->getProcessedJson().find("\"input\":\"What is OpenVINO?\""), std::string::npos); -} - TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesConflictingOutputAndCompletionTokensFails) { std::string json = R"({ "model": "llama", @@ -1914,30 +1873,6 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesToolChoiceFunctionObjectNam EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("tool_choice.name is not a valid string")); } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageUrlStringSucceeds) { - std::string json = R"({ - "model": "llama", - "input": [ - { - "role": "user", - "content": [ - {"type": "input_text", "text": "what is in this image?"}, - {"type": "input_image", "image_url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="} - ] - } - ] - })"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::optional maxTokensLimit; - uint32_t bestOfLimit = 0; - std::optional maxModelLength; - std::shared_ptr apiHandler = - std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); - EXPECT_EQ(apiHandler->getImageHistory().size(), 1); -} - TEST_F(HttpOpenAIHandlerParsingTest, 
ParsingResponsesInputImageUrlObjectSucceeds) { std::string json = R"({ "model": "llama", From d68afb462dfac41788c36ab6c8c2952af35d5e7a Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 6 Mar 2026 10:45:18 +0100 Subject: [PATCH 07/24] update tools parsing in responses --- src/llm/apis/openai_completions.cpp | 190 +++++++++++----------------- 1 file changed, 75 insertions(+), 115 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index af87a6a971..f06e25bdac 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -220,85 +220,6 @@ std::string serializeResponsesUnaryResponse( return buffer.GetString(); } -absl::Status normalizeResponsesFunctionToolsInPlace(rapidjson::Document& doc) { - auto toolsIt = doc.FindMember("tools"); - if (toolsIt == doc.MemberEnd() || toolsIt->value.IsNull()) { - return absl::OkStatus(); - } - if (!toolsIt->value.IsArray()) { - return absl::InvalidArgumentError("Tools are not an array"); - } - - auto& allocator = doc.GetAllocator(); - for (auto& toolValue : toolsIt->value.GetArray()) { - if (!toolValue.IsObject()) { - return absl::InvalidArgumentError("Tool is not a JSON object"); - } - auto toolObj = toolValue.GetObject(); - auto typeIt = toolObj.FindMember("type"); - if (typeIt == toolObj.MemberEnd() || !typeIt->value.IsString()) { - return absl::InvalidArgumentError("Tool type is missing or invalid"); - } - if (std::string(typeIt->value.GetString()) != "function") { - return absl::InvalidArgumentError("Only function tools are supported"); - } - - auto functionIt = toolObj.FindMember("function"); - if (functionIt != toolObj.MemberEnd()) { - if (!functionIt->value.IsObject()) { - return absl::InvalidArgumentError("Function is not a valid JSON object"); - } - continue; - } - - auto nameIt = toolObj.FindMember("name"); - if (nameIt == toolObj.MemberEnd() || !nameIt->value.IsString()) { - return absl::InvalidArgumentError("Function object does not contain a valid name field"); - } - - rapidjson::Value functionObj(rapidjson::kObjectType); - functionObj.AddMember("name", rapidjson::Value(nameIt->value.GetString(), allocator), allocator); - - auto descriptionIt = toolObj.FindMember("description"); - if (descriptionIt != toolObj.MemberEnd() && descriptionIt->value.IsString()) { - functionObj.AddMember("description", rapidjson::Value(descriptionIt->value.GetString(), allocator), allocator); - } - - auto parametersIt = toolObj.FindMember("parameters"); - if (parametersIt != toolObj.MemberEnd()) { - if (!parametersIt->value.IsObject()) { - return absl::InvalidArgumentError("Function parameters are not a valid JSON object"); - } - rapidjson::Value parametersCopy(rapidjson::kObjectType); - parametersCopy.CopyFrom(parametersIt->value, allocator); - functionObj.AddMember("parameters", parametersCopy, allocator); - } - - toolValue.AddMember("function", functionObj, allocator); - } - - auto toolChoiceIt = doc.FindMember("tool_choice"); - if (toolChoiceIt != doc.MemberEnd() && !toolChoiceIt->value.IsNull() && toolChoiceIt->value.IsObject()) { - auto toolChoiceObj = toolChoiceIt->value.GetObject(); - auto functionIt = toolChoiceObj.FindMember("function"); - if (functionIt == toolChoiceObj.MemberEnd()) { - auto typeIt = toolChoiceObj.FindMember("type"); - auto nameIt = toolChoiceObj.FindMember("name"); - if (typeIt != toolChoiceObj.MemberEnd() && typeIt->value.IsString() && std::string(typeIt->value.GetString()) == "function") { - if (nameIt == toolChoiceObj.MemberEnd() || 
!nameIt->value.IsString()) { - return absl::InvalidArgumentError("tool_choice.name is not a valid string"); - } - - rapidjson::Value functionObj(rapidjson::kObjectType); - functionObj.AddMember("name", rapidjson::Value(nameIt->value.GetString(), allocator), allocator); - toolChoiceIt->value.AddMember("function", functionObj, allocator); - } - } - } - - return absl::OkStatus(); -} - } // namespace absl::Status OpenAIChatCompletionsHandler::parseCompletionsPart() { @@ -820,8 +741,9 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required") return absl::InvalidArgumentError("tool_choice should be either 'none' or 'auto' or 'required'"); } else if (tool_choice_it->value.IsObject()) { - auto tool_choice_functionIt = tool_choice_it->value.GetObject().FindMember("function"); - if (tool_choice_functionIt != tool_choice_it->value.GetObject().MemberEnd() && tool_choice_functionIt->value.IsObject()) { + auto toolChoiceObj = tool_choice_it->value.GetObject(); + auto tool_choice_functionIt = toolChoiceObj.FindMember("function"); + if (tool_choice_functionIt != toolChoiceObj.MemberEnd() && tool_choice_functionIt->value.IsObject()) { auto nameIt = tool_choice_functionIt->value.GetObject().FindMember("name"); if (nameIt != tool_choice_functionIt->value.GetObject().MemberEnd() && nameIt->value.IsString()) { tool_choice = nameIt->value.GetString(); @@ -829,7 +751,16 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { return absl::InvalidArgumentError("tool_choice.function.name is not a valid string"); } } else { - return absl::InvalidArgumentError("tool_choice.function is not a valid JSON object"); + auto typeIt = toolChoiceObj.FindMember("type"); + auto nameIt = toolChoiceObj.FindMember("name"); + if (typeIt != toolChoiceObj.MemberEnd() && typeIt->value.IsString() && std::string(typeIt->value.GetString()) == "function") { + if (nameIt == toolChoiceObj.MemberEnd() || !nameIt->value.IsString()) { + return absl::InvalidArgumentError("tool_choice.name is not a valid string"); + } + tool_choice = nameIt->value.GetString(); + } else { + return absl::InvalidArgumentError("tool_choice.function is not a valid JSON object"); + } } } else { return absl::InvalidArgumentError("tool_choice is not a valid JSON object or string"); @@ -849,38 +780,71 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { auto& obj = it->value.GetArray()[i]; if (!obj.IsObject()) return absl::InvalidArgumentError("Tool is not a JSON object"); + const rapidjson::Value* functionObj = nullptr; + const rapidjson::Value* parametersValue = nullptr; + const char* functionNameCStr = nullptr; + auto functionIt = obj.FindMember("function"); - if (functionIt != obj.MemberEnd() && functionIt->value.IsObject()) { - auto nameIt = functionIt->value.GetObject().FindMember("name"); - if (nameIt != functionIt->value.GetObject().MemberEnd() && nameIt->value.IsString()) { - std::string functionName = nameIt->value.GetString(); - // If tool_choice is set to "auto", we keep all tools - // If tool_choice is set to a specific function name, we keep only that tool - if (tool_choice != "auto" && tool_choice != "required" && tool_choice != functionName) { - it->value.Erase(&obj); - jsonChanged = true; - } else { - i++; - // If we keep the tool, add tool name and schema to the request - auto parametersIt = functionIt->value.GetObject().FindMember("parameters"); - if (parametersIt != functionIt->value.GetObject().MemberEnd() && parametersIt->value.IsObject()) { - // now we 
want to insert to a mapping of - // tool name -> tool schema representations struct - // Dump parameters object to string since this is the schema format expected by GenAI - // Keep the rapidjson::Value object as well to avoid re-parsing in outputParsers - rapidjson::StringBuffer buffer; - rapidjson::Writer writer(buffer); - parametersIt->value.Accept(writer); - std::string parametersStr = buffer.GetString(); - ToolSchemaWrapper schemaReprs{¶metersIt->value, std::move(parametersStr)}; - request.toolNameSchemaMap[nameIt->value.GetString()] = std::move(schemaReprs); - } - } - } else { + if (functionIt != obj.MemberEnd()) { + if (!functionIt->value.IsObject()) { + return absl::InvalidArgumentError("Function is not a valid JSON object"); + } + functionObj = &functionIt->value; + auto nameIt = functionObj->GetObject().FindMember("name"); + if (nameIt == functionObj->GetObject().MemberEnd() || !nameIt->value.IsString()) { return absl::InvalidArgumentError("Function object does not contain a valid name field"); } + functionNameCStr = nameIt->value.GetString(); + auto parametersIt = functionObj->GetObject().FindMember("parameters"); + if (parametersIt != functionObj->GetObject().MemberEnd()) { + parametersValue = ¶metersIt->value; + } } else { - return absl::InvalidArgumentError("Function is not a valid JSON object"); + auto typeIt = obj.FindMember("type"); + if (typeIt == obj.MemberEnd() || !typeIt->value.IsString()) { + return absl::InvalidArgumentError("Tool type is missing or invalid"); + } + if (std::string(typeIt->value.GetString()) != "function") { + return absl::InvalidArgumentError("Only function tools are supported"); + } + + auto nameIt = obj.FindMember("name"); + if (nameIt == obj.MemberEnd() || !nameIt->value.IsString()) { + return absl::InvalidArgumentError("Function object does not contain a valid name field"); + } + functionNameCStr = nameIt->value.GetString(); + + auto parametersIt = obj.FindMember("parameters"); + if (parametersIt != obj.MemberEnd()) { + parametersValue = ¶metersIt->value; + } + } + + std::string functionName = functionNameCStr; + // If tool_choice is set to "auto", we keep all tools + // If tool_choice is set to a specific function name, we keep only that tool + if (tool_choice != "auto" && tool_choice != "required" && tool_choice != functionName) { + it->value.Erase(&obj); + jsonChanged = true; + continue; + } + + i++; + // If we keep the tool, add tool name and schema to the request + if (parametersValue != nullptr) { + if (!parametersValue->IsObject()) { + return absl::InvalidArgumentError("Function parameters are not a valid JSON object"); + } + // now we want to insert to a mapping of + // tool name -> tool schema representations struct + // Dump parameters object to string since this is the schema format expected by GenAI + // Keep the rapidjson::Value object as well to avoid re-parsing in outputParsers + rapidjson::StringBuffer buffer; + rapidjson::Writer writer(buffer); + parametersValue->Accept(writer); + std::string parametersStr = buffer.GetString(); + ToolSchemaWrapper schemaReprs{parametersValue, std::move(parametersStr)}; + request.toolNameSchemaMap[functionNameCStr] = std::move(schemaReprs); } } } else { @@ -1084,11 +1048,7 @@ absl::Status OpenAIChatCompletionsHandler::parseResponsesPart(std::optional Date: Fri, 6 Mar 2026 13:56:26 +0100 Subject: [PATCH 08/24] fix --- src/llm/apis/openai_completions.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp 
index f06e25bdac..417a4e9bba 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -780,8 +780,8 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() {
             auto& obj = it->value.GetArray()[i];
             if (!obj.IsObject())
                 return absl::InvalidArgumentError("Tool is not a JSON object");
-            const rapidjson::Value* functionObj = nullptr;
-            const rapidjson::Value* parametersValue = nullptr;
+            rapidjson::Value* functionObj = nullptr;
+            rapidjson::Value* parametersValue = nullptr;
             const char* functionNameCStr = nullptr;
 
             auto functionIt = obj.FindMember("function");

From 593c365cddd5ca1b7937006dc4dc4056411780b7 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Mon, 16 Mar 2026 13:22:04 +0100
Subject: [PATCH 09/24] fix

---
 src/llm/apis/openai_completions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 417a4e9bba..0c9272d34b 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -1753,7 +1753,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
 std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason) {
     OVMS_PROFILE_FUNCTION();
     if (endpoint == Endpoint::RESPONSES) {
-        const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count();
+        const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count();
         const std::string responseId = "resp-" + std::to_string(createdAt);
 
From d7f84959302b206817db956690f1098530b2e775 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Wed, 18 Mar 2026 13:55:14 +0100
Subject: [PATCH 10/24] fix

---
 demos/continuous_batching/README.md     |  99 +++++++++++++++++++-
 demos/continuous_batching/vlm/README.md | 119 +++++++++++++++++++++++-
 docs/llm/reference.md                   |   2 +-
 src/llm/apis/openai_completions.cpp     |  68 ++++++--------
 src/llm/apis/openai_completions.hpp     |   1 +
 src/llm/servable.cpp                    |   5 +-
 src/test/http_openai_handler_test.cpp   |  94 ++++++++++++++++---
 7 files changed, 330 insertions(+), 58 deletions(-)

diff --git a/demos/continuous_batching/README.md b/demos/continuous_batching/README.md
index ab8c7be951..fb71ad2504 100644
--- a/demos/continuous_batching/README.md
+++ b/demos/continuous_batching/README.md
@@ -16,7 +16,7 @@ ovms_demos_continuous_batching_accuracy
 ```
 This demo shows how to deploy LLM models in the OpenVINO Model Server using continuous batching and paged attention algorithms.
-Text generation use case is exposed via OpenAI API `chat/completions` and `completions` endpoints.
+Text generation use case is exposed via OpenAI API `chat/completions`, `completions` and `responses` endpoints.
 That makes it easy to use and efficient especially on Intel® Xeon® processors and ARC GPUs.
 
 > **Note:** This demo was tested on 4th - 6th generation Intel® Xeon® Scalable Processors, and Intel® Core Ultra Series on Ubuntu24 and Windows11.
@@ -72,7 +72,7 @@ curl http://localhost:8000/v3/models
 ```
 
 ## Request Generation
 
-Model exposes both `chat/completions` and `completions` endpoints with and without stream capabilities.
+Model exposes both `chat/completions`, `completions` and `responses` endpoints with and without stream capabilities.
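+
+The three endpoints differ mainly in how the prompt is carried in the request body. A minimal sketch of the three payload shapes (the model name and question below are placeholders):
+
+```python
+# Sketch only: the same question phrased for each endpoint.
+chat_request = {  # POST /v3/chat/completions
+    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "messages": [{"role": "user", "content": "What is OpenVINO?"}],
+}
+completions_request = {  # POST /v3/completions
+    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "prompt": "What is OpenVINO?",
+}
+responses_request = {  # POST /v3/responses
+    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "input": "What is OpenVINO?",
+}
+```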
 Chat endpoint is expected to be used for scenarios where conversation context should be passed by the client and the model prompt is created by the server based on the jinja model template.
 Completion endpoint should be used to pass the prompt directly by the client and for models without the jinja template.
 
 Here is demonstrated model `Qwen/Qwen3-30B-A3B-Instruct-2507` in int4 precision. It has chat capability so `chat/completions` endpoint will be employed:
@@ -147,9 +147,76 @@ curl -s http://localhost:8000/v3/chat/completions -H "Content-Type: application/
 
 :::
 
+### Unary calls to responses endpoint using cURL
+
+::::{tab-set}
+
+:::{tab-item} Linux
+```bash
+curl http://localhost:8000/v3/responses \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "max_output_tokens": 30,
+    "input": "What is OpenVINO?"
+  }' | jq .
+```
+:::
+
+:::{tab-item} Windows
+Windows Powershell
+```powershell
+(Invoke-WebRequest -Uri "http://localhost:8000/v3/responses" `
+  -Method POST `
+  -Headers @{ "Content-Type" = "application/json" } `
+  -Body '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "max_output_tokens": 30, "input": "What is OpenVINO?"}').Content
+```
+
+Windows Command Prompt
+```bat
+curl -s http://localhost:8000/v3/responses -H "Content-Type: application/json" -d "{\"model\": \"meta-llama/Meta-Llama-3-8B-Instruct\", \"max_output_tokens\": 30, \"input\": \"What is OpenVINO?\"}"
+```
+:::
+
+::::
+
+:::{dropdown} Expected Response
+```json
+{
+  "id": "resp-1724405400",
+  "object": "response",
+  "created_at": 1724405400,
+  "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+  "status": "completed",
+  "output": [
+    {
+      "id": "msg-0",
+      "type": "message",
+      "role": "assistant",
+      "status": "completed",
+      "content": [
+        {
+          "type": "output_text",
+          "text": "OpenVINO is an open-source software framework developed by Intel for optimizing and deploying computer vision, machine learning, and deep learning models on various devices,",
+          "annotations": []
+        }
+      ]
+    }
+  ],
+  "usage": {
+    "input_tokens": 27,
+    "input_tokens_details": { "cached_tokens": 0 },
+    "output_tokens": 30,
+    "output_tokens_details": { "reasoning_tokens": 0 },
+    "total_tokens": 57
+  }
+}
+```
+:::
+
 ### OpenAI Python package
 
-The endpoints `chat/completions` and `completions` are compatible with OpenAI client so it can be easily used to generate code also in streaming mode:
+The endpoints `chat/completions`, `completions` and `responses` are compatible with the OpenAI client, so it can be easily used to generate text, also in streaming mode:
 
 Install the client library:
 ```console
@@ -261,6 +328,31 @@ So, **6 = 3**.
 ```
 :::
 
+:::{tab-item} Responses
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="http://localhost:8000/v3",
+    api_key="unused"
+)
+
+stream = client.responses.create(
+    model="meta-llama/Meta-Llama-3-8B-Instruct",
+    input="Say this is a test",
+    stream=True,
+)
+for event in stream:
+    if event.type == "response.output_text.delta":
+        print(event.delta, end="", flush=True)
+```
+
+Output:
+```
+It looks like you're testing me!
+``` +::: + :::: ## Check how to use AI agents with MCP servers and language models @@ -299,5 +391,6 @@ Check the [guide of using lm-evaluation-harness](./accuracy/README.md) - [Official OpenVINO LLM models in HuggingFace](https://huggingface.co/collections/OpenVINO/llm) - [Chat Completions API](../../docs/model_server_rest_api_chat.md) - [Completions API](../../docs/model_server_rest_api_completions.md) +- [Responses API](../../docs/model_server_rest_api_responses.md) - [Writing client code](../../docs/clients_genai.md) - [LLM calculator reference](../../docs/llm/reference.md) diff --git a/demos/continuous_batching/vlm/README.md b/demos/continuous_batching/vlm/README.md index 0de93e27b8..2580360bd3 100644 --- a/demos/continuous_batching/vlm/README.md +++ b/demos/continuous_batching/vlm/README.md @@ -9,7 +9,7 @@ ovms_demos_vlm_npu ``` This demo shows how to deploy Vision Language Models in the OpenVINO Model Server. -Text generation use case is exposed via OpenAI API `chat/completions` endpoint. +Text generation use case is exposed via OpenAI API `chat/completions` and `responses` endpoints. > **Note:** This demo was tested on 4th - 6th generation Intel® Xeon® Scalable Processors, Intel® Arc™ GPU Series and Intel® Core Ultra Series on Ubuntu24, RedHat9 and Windows11. @@ -119,6 +119,45 @@ curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/js ``` ::: +:::{dropdown} **Unary call with curl using responses endpoint** +**Note**: using urls in request requires `--allowed_media_domains` parameter described [here](../../../docs/parameters.md) + +```bash +curl http://localhost:8000/v3/responses -H "Content-Type: application/json" -d "{ \"model\": \"OpenGVLab/InternVL2-2B\", \"input\":[{\"role\": \"user\", \"content\": [{\"type\": \"input_text\", \"text\": \"Describe what is on the picture.\"},{\"type\": \"input_image\", \"image_url\": \"http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/3/demos/common/static/images/zebra.jpeg\"}]}], \"max_output_tokens\": 100}" +``` +```json +{ + "id": "resp-1741731554", + "object": "response", + "created_at": 1741731554, + "model": "OpenGVLab/InternVL2-2B", + "status": "completed", + "output": [ + { + "id": "msg-0", + "type": "message", + "role": "assistant", + "status": "completed", + "content": [ + { + "type": "output_text", + "text": "The picture features a zebra standing in a grassy plain. Zebras are known for their distinctive black and white striped patterns, which help them blend in for camouflage purposes.", + "annotations": [] + } + ] + } + ], + "usage": { + "input_tokens": 19, + "input_tokens_details": { "cached_tokens": 0 }, + "output_tokens": 83, + "output_tokens_details": { "reasoning_tokens": 0 }, + "total_tokens": 102 + } +} +``` +::: + :::{dropdown} **Unary call with python requests library** ```console @@ -177,9 +216,9 @@ print(response.text) } ``` ::: -:::{dropdown} **Streaming request with OpenAI client** +:::{dropdown} **Streaming request with OpenAI client using chat/completions** -The endpoints `chat/completions` is compatible with OpenAI client so it can be easily used to generate code also in streaming mode: +The endpoints `chat/completions` and `responses` are compatible with OpenAI client so it can be easily used to generate code also in streaming mode: Install the client library: ```console @@ -223,6 +262,79 @@ The picture features a zebra standing in a grassy area. 
The zebra is characteriz ::: +:::{dropdown} **Streaming request with OpenAI client using responses endpoint** + +```console +pip3 install openai +``` +```python +from openai import OpenAI +import base64 +base_url='http://localhost:8080/v3' +model_name = "OpenGVLab/InternVL2-2B" + +client = OpenAI(api_key='unused', base_url=base_url) + +def convert_image(Image): + with open(Image,'rb' ) as file: + base64_image = base64.b64encode(file.read()).decode("utf-8") + return base64_image + +stream = client.responses.create( + model=model_name, + input=[ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "Describe what is on the picture."}, + {"type": "input_image", "image_url": f"data:image/jpeg;base64,{convert_image('zebra.jpeg')}"} + ] + } + ], + stream=True, +) +for event in stream: + if event.type == "response.output_text.delta": + print(event.delta, end="", flush=True) +``` + +Output: +``` +The picture features a zebra standing in a grassy area. The zebra is characterized by its distinctive black and white striped pattern, which covers its entire body, including its legs, neck, and head. Zebras have small, rounded ears and a long, flowing tail. The background appears to be a natural grassy habitat, typical of a savanna or plain. +``` + +::: + +## Benchmarking text generation with high concurrency + +OpenVINO Model Server employs efficient parallelization for text generation. It can be used to generate text also in high concurrency in the environment shared by multiple clients. +It can be demonstrated using benchmarking app from vLLM repository: +```console +git clone --branch v0.7.3 --depth 1 https://github.com/vllm-project/vllm +cd vllm +pip3 install -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +cd benchmarks +python benchmark_serving.py --backend openai-chat --dataset-name hf --dataset-path lmarena-ai/vision-arena-bench-v0.1 --hf-split train --host localhost --port 8000 --model OpenGVLab/InternVL2-2B --endpoint /v3/chat/completions --max-concurrency 1 --num-prompts 100 --trust-remote-code + +Burstiness factor: 1.0 (Poisson process) +Maximum request concurrency: None +============ Serving Benchmark Result ============ +Successful requests: 100 +Benchmark duration (s): 287.81 +Total input tokens: 15381 +Total generated tokens: 20109 +Request throughput (req/s): 0.35 +Output token throughput (tok/s): 69.87 +Total Token throughput (tok/s): 123.31 +---------------Time to First Token---------------- +Mean TTFT (ms): 1513.96 +Median TTFT (ms): 1368.93 +P99 TTFT (ms): 2647.45 +-----Time per Output Token (excl. 
1st token)------ +Mean TPOT (ms): 6.68 +Median TPOT (ms): 6.68 +P99 TPOT (ms): 8.02 +``` ## Testing the model accuracy over serving API @@ -237,5 +349,6 @@ Check [VLM usage with NPU acceleration](../../vlm_npu/README.md) - [Export models to OpenVINO format](../common/export_models/README.md) - [Supported VLM models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#visual-language-models-vlms) - [Chat Completions API](../../../docs/model_server_rest_api_chat.md) +- [Responses API](../../../docs/model_server_rest_api_responses.md) - [Writing client code](../../../docs/clients_genai.md) - [LLM calculator reference](../../../docs/llm/reference.md) diff --git a/docs/llm/reference.md b/docs/llm/reference.md index 654c9b6d90..222777e4a7 100644 --- a/docs/llm/reference.md +++ b/docs/llm/reference.md @@ -44,7 +44,7 @@ struct HttpPayload { std::shared_ptr client; }; ``` -The input json content should be compatible with the [chat completions](../model_server_rest_api_chat.md) or [completions](../model_server_rest_api_completions.md) API. +The input json content should be compatible with the [chat completions](../model_server_rest_api_chat.md), [completions](../model_server_rest_api_completions.md) or [responses](../model_server_rest_api_responses.md) API. The input also includes a side packet with a reference to `LLM_NODE_RESOURCES` which is a shared object representing an LLM engine. It loads the model, runs the generation cycles and reports the generated results to the LLM calculator via a generation handler. diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 0c9272d34b..588369f4b8 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -1053,22 +1053,6 @@ absl::Status OpenAIChatCompletionsHandler::parseResponsesPart(std::optional maxCompletionTokens; - std::optional maxOutputTokens; - - // max_completion_tokens: uint; optional - it = doc.FindMember("max_completion_tokens"); - if (it != doc.MemberEnd() && !it->value.IsNull()) { - if (!it->value.IsUint()) { - if (it->value.IsUint64()) - return absl::InvalidArgumentError("max_completion_tokens value can't be greater than 4294967295"); - return absl::InvalidArgumentError("max_completion_tokens is not an unsigned integer"); - } - if (maxTokensLimit.has_value() && it->value.GetUint() > maxTokensLimit.value()) - return absl::InvalidArgumentError(absl::StrCat("max_completion_tokens exceeds limit provided in graph config: ", maxTokensLimit.value())); - maxCompletionTokens = it->value.GetUint(); - } - // max_output_tokens: uint; optional // OpenAI Responses API uses this field for output token limit. 
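+    // Only max_output_tokens is honored here; max_tokens and max_completion_tokens
+    // are not read for the Responses endpoint (parseCommonPart skips max_tokens
+    // when endpoint == Endpoint::RESPONSES).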
it = doc.FindMember("max_output_tokens"); @@ -1080,21 +1064,12 @@ absl::Status OpenAIChatCompletionsHandler::parseResponsesPart(std::optionalvalue.GetUint() > maxTokensLimit.value()) return absl::InvalidArgumentError(absl::StrCat("max_output_tokens exceeds limit provided in graph config: ", maxTokensLimit.value())); - maxOutputTokens = it->value.GetUint(); - } - - if (maxCompletionTokens.has_value() && maxOutputTokens.has_value() && maxCompletionTokens.value() != maxOutputTokens.value()) { - return absl::InvalidArgumentError("max_output_tokens and max_completion_tokens must match when both are provided"); - } - if (maxOutputTokens.has_value()) { - request.maxTokens = maxOutputTokens.value(); - } else if (maxCompletionTokens.has_value()) { - request.maxTokens = maxCompletionTokens.value(); + request.maxTokens = it->value.GetUint(); } - // specific part of max_tokens validation + // specific part of max_output_tokens validation if (request.maxTokens == 0) { - return absl::InvalidArgumentError("max_tokens value should be greater than 0"); + return absl::InvalidArgumentError("max_output_tokens value should be greater than 0"); } // parse response_format @@ -1174,16 +1149,23 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optionalvalue.IsUint()) { - if (it->value.IsUint64()) - return absl::InvalidArgumentError("max_tokens value can't be greater than 4294967295"); - return absl::InvalidArgumentError("max_tokens is not an unsigned integer"); + // Not applicable for RESPONSES endpoint which uses max_output_tokens instead + if (endpoint != Endpoint::RESPONSES) { + it = doc.FindMember("max_tokens"); + if (it != doc.MemberEnd()) { + if (!it->value.IsUint()) { + if (it->value.IsUint64()) + return absl::InvalidArgumentError("max_tokens value can't be greater than 4294967295"); + return absl::InvalidArgumentError("max_tokens is not an unsigned integer"); + } + if (maxTokensLimit.has_value() && !(it->value.GetUint() < maxTokensLimit.value())) + return absl::InvalidArgumentError(absl::StrCat("max_tokens exceeds limit provided in graph config: ", maxTokensLimit.value())); + request.maxTokens = it->value.GetUint(); + } else { + if (maxTokensLimit.has_value()) { + request.maxTokens = maxTokensLimit.value(); + } } - if (maxTokensLimit.has_value() && !(it->value.GetUint() < maxTokensLimit.value())) - return absl::InvalidArgumentError(absl::StrCat("max_tokens exceeds limit provided in graph config: ", maxTokensLimit.value())); - request.maxTokens = it->value.GetUint(); } else { if (maxTokensLimit.has_value()) { request.maxTokens = maxTokensLimit.value(); @@ -1375,6 +1357,7 @@ std::optional OpenAIChatCompletionsHandler::getNumReturnSequences() const { StreamOptions OpenAIChatCompletionsHandler::getStreamOptions() const { return request.streamOptions; } bool OpenAIChatCompletionsHandler::isStream() const { return request.stream; } +Endpoint OpenAIChatCompletionsHandler::getEndpoint() const { return endpoint; } std::string OpenAIChatCompletionsHandler::getModel() const { return request.model; } std::string OpenAIChatCompletionsHandler::getToolChoice() const { return request.toolChoice; } const std::unique_ptr& OpenAIChatCompletionsHandler::getOutputParser() const { return outputParser; } @@ -1928,6 +1911,17 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str writer.EndObject(); })); + events.emplace_back(serializeResponsesEvent([this, &serializeResponseObject](Writer& writer) { + writer.StartObject(); + writer.String("type"); + 
writer.String("response.in_progress"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("response"); + serializeResponseObject(writer, "in_progress", "", false); + writer.EndObject(); + })); + events.emplace_back(serializeResponsesEvent([this, &outputItemId, &serializeOutputItem](Writer& writer) { writer.StartObject(); writer.String("type"); diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 69da9b76d2..9b6be026cc 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -113,6 +113,7 @@ class OpenAIChatCompletionsHandler { std::optional getResponseFormat() const; bool isStream() const; + Endpoint getEndpoint() const; std::string getModel() const; std::string getToolChoice() const; const std::unique_ptr& getOutputParser() const; diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index b9c619057c..3b36aecb94 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -310,7 +310,10 @@ absl::Status GenAiServable::preparePartialResponse(std::shared_ptr 0) { + // For RESPONSES endpoint, always call serializeStreamingChunk so that + // initialization events (response.created, response.in_progress, etc.) + // are emitted immediately, even before the tokenizer produces text. + if (lastTextChunk.size() > 0 || executionContext->apiHandler->getEndpoint() == Endpoint::RESPONSES) { std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason); if (!serializedChunk.empty()) { executionContext->response = wrapTextInServerSideEventMessage(serializedChunk); diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 474cde8f28..875e23ed02 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -836,27 +836,60 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesContains std::optional maxModelLength; ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); - std::string firstChunk = apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); - ASSERT_NE(firstChunk.find("\"type\":\"response.created\""), std::string::npos) << firstChunk; - ASSERT_NE(firstChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << firstChunk; - ASSERT_NE(firstChunk.find("\"type\":\"response.content_part.added\""), std::string::npos) << firstChunk; - ASSERT_NE(firstChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << firstChunk; - ASSERT_NE(firstChunk.find("\"delta\":\"Hello\""), std::string::npos) << firstChunk; - + // Phase 1: Init events emitted even with empty text (before tokenizer produces output) + std::string initChunk = apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::NONE); + ASSERT_NE(initChunk.find("\"type\":\"response.created\""), std::string::npos) << initChunk; + ASSERT_NE(initChunk.find("\"type\":\"response.in_progress\""), std::string::npos) << initChunk; + ASSERT_NE(initChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << initChunk; + ASSERT_NE(initChunk.find("\"type\":\"response.content_part.added\""), std::string::npos) << initChunk; + // No delta event when text is empty + ASSERT_EQ(initChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << initChunk; + + // Verify correct event ordering: created < in_progress < output_item.added < 
content_part.added + auto createdPos = initChunk.find("\"type\":\"response.created\""); + auto inProgressPos = initChunk.find("\"type\":\"response.in_progress\""); + auto outputItemAddedPos = initChunk.find("\"type\":\"response.output_item.added\""); + auto contentPartAddedPos = initChunk.find("\"type\":\"response.content_part.added\""); + ASSERT_LT(createdPos, inProgressPos) << "response.created must come before response.in_progress"; + ASSERT_LT(inProgressPos, outputItemAddedPos) << "response.in_progress must come before response.output_item.added"; + ASSERT_LT(outputItemAddedPos, contentPartAddedPos) << "response.output_item.added must come before response.content_part.added"; + + // Phase 2: Second call should only contain delta, no repeated init events + std::string secondChunk = apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::NONE); + ASSERT_TRUE(secondChunk.empty()) << "Empty text after init should produce no output: " << secondChunk; + + // Phase 3: Text delta + std::string deltaChunk = apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + ASSERT_NE(deltaChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << deltaChunk; + ASSERT_NE(deltaChunk.find("\"delta\":\"Hello\""), std::string::npos) << deltaChunk; + ASSERT_EQ(deltaChunk.find("\"type\":\"response.created\""), std::string::npos) << "No repeated init events: " << deltaChunk; + + // Phase 4: Final chunk with finish reason std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP); + ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << finalChunk; ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.done\""), std::string::npos) << finalChunk; ASSERT_NE(finalChunk.find("\"type\":\"response.content_part.done\""), std::string::npos) << finalChunk; ASSERT_NE(finalChunk.find("\"type\":\"response.output_item.done\""), std::string::npos) << finalChunk; ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk; ASSERT_NE(finalChunk.find("\"text\":\"Hello world\""), std::string::npos) << finalChunk; + + // Verify correct event ordering in final chunk: delta < output_text.done < content_part.done < output_item.done < completed + auto deltaPos = finalChunk.find("\"type\":\"response.output_text.delta\""); + auto textDonePos = finalChunk.find("\"type\":\"response.output_text.done\""); + auto partDonePos = finalChunk.find("\"type\":\"response.content_part.done\""); + auto itemDonePos = finalChunk.find("\"type\":\"response.output_item.done\""); + auto completedPos = finalChunk.find("\"type\":\"response.completed\""); + ASSERT_LT(deltaPos, textDonePos) << "delta must come before output_text.done"; + ASSERT_LT(textDonePos, partDonePos) << "output_text.done must come before content_part.done"; + ASSERT_LT(partDonePos, itemDonePos) << "content_part.done must come before output_item.done"; + ASSERT_LT(itemDonePos, completedPos) << "output_item.done must come before response.completed"; } TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingUsageChunkForResponsesIsEmpty) { std::string json = R"({ "model": "llama", "input": "What is OpenVINO?", - "stream": true, - "stream_options": {"include_usage": true} + "stream": true })"; doc.Parse(json.c_str()); ASSERT_FALSE(doc.HasParseError()); @@ -1631,12 +1664,45 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingRequestWithNullParametersCompletions } } -TEST_F(HttpOpenAIHandlerParsingTest, 
ParsingResponsesConflictingOutputAndCompletionTokensFails) { +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxOutputTokensSetsLimit) { + std::string json = R"({ + "model": "llama", + "input": "valid prompt", + "max_output_tokens": 42 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + EXPECT_TRUE(apiHandler->getMaxTokens().has_value()); + EXPECT_EQ(apiHandler->getMaxTokens().value(), 42); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxCompletionTokensIsIgnored) { + std::string json = R"({ + "model": "llama", + "input": "valid prompt", + "max_completion_tokens": 50 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + // max_completion_tokens should be ignored for RESPONSES endpoint, so maxTokens should not be 50 + EXPECT_FALSE(apiHandler->getMaxTokens().has_value() && apiHandler->getMaxTokens().value() == 50); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxTokensIsIgnored) { std::string json = R"({ "model": "llama", "input": "valid prompt", - "max_output_tokens": 5, - "max_completion_tokens": 7 + "max_tokens": 50 })"; doc.Parse(json.c_str()); ASSERT_FALSE(doc.HasParseError()); @@ -1644,7 +1710,9 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesConflictingOutputAndComplet uint32_t bestOfLimit = 0; std::optional maxModelLength; std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("max_output_tokens and max_completion_tokens must match when both are provided")); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + // max_tokens should be ignored for RESPONSES endpoint, so maxTokens should not be 50 + EXPECT_FALSE(apiHandler->getMaxTokens().has_value() && apiHandler->getMaxTokens().value() == 50); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesFlatFunctionToolsSucceeds) { From d94ac47b30bdb74485f631859fda5c61751be8fa Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Wed, 18 Mar 2026 09:03:04 +0100 Subject: [PATCH 11/24] fix --- src/llm/apis/openai_completions.cpp | 389 +++++++++++--------------- src/llm/apis/openai_completions.hpp | 10 + src/test/http_openai_handler_test.cpp | 32 +++ 3 files changed, 209 insertions(+), 222 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 588369f4b8..568bad4551 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -96,46 +96,155 @@ ov::genai::JsonContainer rapidJsonValueToJsonContainer(const rapidjson::Value& v throw std::invalid_argument("Unsupported JSON value type"); } -std::string serializeResponsesUnaryResponse( - const std::vector& parsedOutputs, - const CompletionUsageStatistics& usage, - const 
OpenAIChatCompletionsRequest& request, - const ToolsSchemas_t& toolNameSchemaMap, - std::chrono::time_point created) { - const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); - const std::string responseId = "resp-" + std::to_string(createdAt); +std::string serializeResponsesEvent(const std::function&)>& eventSerializer) { + StringBuffer eventBuffer; + Writer eventWriter(eventBuffer); + eventSerializer(eventWriter); + return std::string(eventBuffer.GetString()); +} - auto serializeResponsesToolChoice = [&request](Writer& writer) { - writer.String("tool_choice"); - if (request.toolChoice.empty()) { - writer.String("auto"); - } else if (request.toolChoice == "auto" || request.toolChoice == "none" || request.toolChoice == "required") { - writer.String(request.toolChoice.c_str()); - } else { - writer.StartObject(); - writer.String("type"); - writer.String("function"); - writer.String("name"); - writer.String(request.toolChoice.c_str()); - writer.EndObject(); - } - }; +} // namespace + +void OpenAIChatCompletionsHandler::serializeResponsesToolChoice(Writer& writer) const { + writer.String("tool_choice"); + if (request.toolChoice.empty()) { + writer.String("auto"); + } else if (request.toolChoice == "auto" || request.toolChoice == "none" || request.toolChoice == "required") { + writer.String(request.toolChoice.c_str()); + } else { + writer.StartObject(); + writer.String("type"); + writer.String("function"); + writer.String("name"); + writer.String(request.toolChoice.c_str()); + writer.EndObject(); + } +} + +void OpenAIChatCompletionsHandler::serializeResponsesTools(Writer& writer) const { + writer.String("tools"); + writer.StartArray(); + for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) { + writer.StartObject(); + writer.String("type"); + writer.String("function"); + writer.String("name"); + writer.String(toolName.c_str()); + writer.String("parameters"); + writer.RawValue(toolSchemaWrapper.stringRepr.c_str(), toolSchemaWrapper.stringRepr.size(), rapidjson::kObjectType); + writer.EndObject(); + } + writer.EndArray(); +} + +void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer& writer, const std::string& responseId, int64_t createdAt, + const char* status, const std::string& fullOutputText, bool includeUsage) const { + writer.StartObject(); + writer.String("id"); + writer.String(responseId.c_str()); + writer.String("object"); + writer.String("response"); + writer.String("created_at"); + writer.Int64(createdAt); + if (std::string(status) == "completed") { + writer.String("completed_at"); + writer.Int64(createdAt); + } + writer.String("model"); + writer.String(request.model.c_str()); + writer.String("status"); + writer.String(status); - auto serializeResponsesTools = [&toolNameSchemaMap](Writer& writer) { - writer.String("tools"); + writer.String("parallel_tool_calls"); + writer.Bool(false); + serializeResponsesToolChoice(writer); + serializeResponsesTools(writer); + + if (request.maxTokens.has_value()) { + writer.String("max_output_tokens"); + writer.Uint64(static_cast(request.maxTokens.value())); + } + + writer.String("output"); + writer.StartArray(); + if (!fullOutputText.empty()) { + writer.StartObject(); + writer.String("id"); + writer.String("msg-0"); + writer.String("type"); + writer.String("message"); + writer.String("role"); + writer.String("assistant"); + writer.String("status"); + writer.String(std::string(status) == "completed" ? 
"completed" : "in_progress"); + writer.String("content"); writer.StartArray(); - for (const auto& [toolName, toolSchemaWrapper] : toolNameSchemaMap) { - writer.StartObject(); - writer.String("type"); - writer.String("function"); - writer.String("name"); - writer.String(toolName.c_str()); - writer.String("parameters"); - writer.RawValue(toolSchemaWrapper.stringRepr.c_str(), toolSchemaWrapper.stringRepr.size(), rapidjson::kObjectType); - writer.EndObject(); - } + serializeResponsesPart(writer, fullOutputText); writer.EndArray(); - }; + writer.EndObject(); + } + writer.EndArray(); + + if (includeUsage) { + writer.String("usage"); + writer.StartObject(); + writer.String("input_tokens"); + writer.Uint64(static_cast(usage.promptTokens)); + writer.String("input_tokens_details"); + writer.StartObject(); + writer.String("cached_tokens"); + writer.Uint64(0); + writer.EndObject(); + writer.String("output_tokens"); + writer.Uint64(static_cast(usage.completionTokens)); + writer.String("output_tokens_details"); + writer.StartObject(); + writer.String("reasoning_tokens"); + writer.Uint64(0); + writer.EndObject(); + writer.String("total_tokens"); + writer.Uint64(static_cast(usage.calculateTotalTokens())); + writer.EndObject(); + } + + writer.EndObject(); +} + +void OpenAIChatCompletionsHandler::serializeResponsesOutputItem(Writer& writer, const std::string& outputItemId, + const std::string& text, const char* status, bool withContent) { + writer.StartObject(); + writer.String("id"); + writer.String(outputItemId.c_str()); + writer.String("type"); + writer.String("message"); + writer.String("role"); + writer.String("assistant"); + writer.String("status"); + writer.String(status); + writer.String("content"); + writer.StartArray(); + if (withContent) { + serializeResponsesPart(writer, text); + } + writer.EndArray(); + writer.EndObject(); +} + +void OpenAIChatCompletionsHandler::serializeResponsesPart(Writer& writer, const std::string& text) { + writer.StartObject(); + writer.String("type"); + writer.String("output_text"); + writer.String("text"); + writer.String(text.c_str()); + writer.String("annotations"); + writer.StartArray(); + writer.EndArray(); + writer.EndObject(); +} + +std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const std::vector& parsedOutputs) const { + const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); + const std::string responseId = "resp-" + std::to_string(createdAt); StringBuffer buffer; Writer writer(buffer); @@ -181,15 +290,7 @@ std::string serializeResponsesUnaryResponse( writer.String("completed"); writer.String("content"); writer.StartArray(); - writer.StartObject(); - writer.String("type"); - writer.String("output_text"); - writer.String("text"); - writer.String(parsedOutput.content.c_str()); - writer.String("annotations"); - writer.StartArray(); - writer.EndArray(); - writer.EndObject(); + serializeResponsesPart(writer, parsedOutput.content); writer.EndArray(); writer.EndObject(); } @@ -220,8 +321,6 @@ std::string serializeResponsesUnaryResponse( return buffer.GetString(); } -} // namespace - absl::Status OpenAIChatCompletionsHandler::parseCompletionsPart() { // prompt: string auto it = doc.FindMember("prompt"); @@ -1292,6 +1391,7 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optionalvalue.IsNull()) { if (!it->value.IsUint()) @@ -1307,12 +1407,15 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optionalvalue.IsNull()) { if (!it->value.IsUint()) return absl::InvalidArgumentError("n 
is not an unsigned integer"); if (it->value.GetUint() == 0) return absl::InvalidArgumentError("n value should be greater than 0"); + if (endpoint == Endpoint::RESPONSES && request.stream && it->value.GetUint() > 1) + return absl::InvalidArgumentError("n greater than 1 is not supported for responses streaming"); size_t bestOf = request.bestOf.has_value() ? request.bestOf.value() : 1; // 1 is default best_of value if (bestOf < it->value.GetUint()) { return absl::InvalidArgumentError("n value cannot be greater than best_of"); @@ -1441,7 +1544,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect updateUsage(usage, generationOutput.generated_ids, request.echo); parsedOutputs.push_back(parseOutputIfNeeded(generationOutput.generated_ids)); } - return serializeResponsesUnaryResponse(parsedOutputs, usage, request, request.toolNameSchemaMap, created); + return serializeResponsesUnaryResponse(parsedOutputs); } OpenAiJsonResponse jsonResponse; @@ -1575,7 +1678,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco updateUsage(usage, tokens, request.echo); parsedOutputs.push_back(parseOutputIfNeeded(tokens)); } - return serializeResponsesUnaryResponse(parsedOutputs, usage, request, request.toolNameSchemaMap, created); + return serializeResponsesUnaryResponse(parsedOutputs); } OpenAiJsonResponse jsonResponse; @@ -1656,7 +1759,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD updateUsage(usage, generatedTokens, request.echo); parsedOutputs.push_back(parseOutputIfNeeded(generatedTokens)); } - return serializeResponsesUnaryResponse(parsedOutputs, usage, request, request.toolNameSchemaMap, created); + return serializeResponsesUnaryResponse(parsedOutputs); } OpenAiJsonResponse jsonResponse; @@ -1740,189 +1843,31 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str const std::string responseId = "resp-" + std::to_string(createdAt); const std::string outputItemId = "msg-0"; - auto serializeResponsesToolChoice = [this](Writer& writer) { - writer.String("tool_choice"); - if (request.toolChoice.empty()) { - writer.String("auto"); - } else if (request.toolChoice == "auto" || request.toolChoice == "none" || request.toolChoice == "required") { - writer.String(request.toolChoice.c_str()); - } else { - writer.StartObject(); - writer.String("type"); - writer.String("function"); - writer.String("name"); - writer.String(request.toolChoice.c_str()); - writer.EndObject(); - } - }; - - auto serializeResponsesTools = [this](Writer& writer) { - writer.String("tools"); - writer.StartArray(); - for (const auto& [toolName, toolSchemaWrapper] : request.toolNameSchemaMap) { - writer.StartObject(); - writer.String("type"); - writer.String("function"); - writer.String("name"); - writer.String(toolName.c_str()); - writer.String("parameters"); - writer.RawValue(toolSchemaWrapper.stringRepr.c_str(), toolSchemaWrapper.stringRepr.size(), rapidjson::kObjectType); - writer.EndObject(); - } - writer.EndArray(); - }; - - auto serializeResponseObject = [this, &responseId, createdAt, &serializeResponsesToolChoice, &serializeResponsesTools](Writer& writer, const char* status, const std::string& fullOutputText, bool includeUsage) { - writer.StartObject(); - writer.String("id"); - writer.String(responseId.c_str()); - writer.String("object"); - writer.String("response"); - writer.String("created_at"); - writer.Int64(createdAt); - if (std::string(status) == "completed") { - writer.String("completed_at"); - 
writer.Int64(createdAt); - } - writer.String("model"); - writer.String(request.model.c_str()); - writer.String("status"); - writer.String(status); - - writer.String("parallel_tool_calls"); - writer.Bool(false); - serializeResponsesToolChoice(writer); - serializeResponsesTools(writer); - - if (request.maxTokens.has_value()) { - writer.String("max_output_tokens"); - writer.Uint64(static_cast(request.maxTokens.value())); - } - - writer.String("output"); - writer.StartArray(); - if (!fullOutputText.empty()) { - writer.StartObject(); - writer.String("id"); - writer.String("msg-0"); - writer.String("type"); - writer.String("message"); - writer.String("role"); - writer.String("assistant"); - writer.String("status"); - writer.String(std::string(status) == "completed" ? "completed" : "in_progress"); - writer.String("content"); - writer.StartArray(); - writer.StartObject(); - writer.String("type"); - writer.String("output_text"); - writer.String("text"); - writer.String(fullOutputText.c_str()); - writer.String("annotations"); - writer.StartArray(); - writer.EndArray(); - writer.EndObject(); - writer.EndArray(); - writer.EndObject(); - } - writer.EndArray(); - - if (includeUsage) { - writer.String("usage"); - writer.StartObject(); - writer.String("input_tokens"); - writer.Uint64(static_cast(usage.promptTokens)); - writer.String("input_tokens_details"); - writer.StartObject(); - writer.String("cached_tokens"); - writer.Uint64(0); - writer.EndObject(); - writer.String("output_tokens"); - writer.Uint64(static_cast(usage.completionTokens)); - writer.String("output_tokens_details"); - writer.StartObject(); - writer.String("reasoning_tokens"); - writer.Uint64(0); - writer.EndObject(); - writer.String("total_tokens"); - writer.Uint64(static_cast(usage.calculateTotalTokens())); - writer.EndObject(); - } - - writer.EndObject(); - }; - - auto serializeOutputItem = [&outputItemId](Writer& writer, const std::string& text, const char* status, bool withContent) { - writer.StartObject(); - writer.String("id"); - writer.String(outputItemId.c_str()); - writer.String("type"); - writer.String("message"); - writer.String("role"); - writer.String("assistant"); - writer.String("status"); - writer.String(status); - writer.String("content"); - writer.StartArray(); - if (withContent) { - writer.StartObject(); - writer.String("type"); - writer.String("output_text"); - writer.String("text"); - writer.String(text.c_str()); - writer.String("annotations"); - writer.StartArray(); - writer.EndArray(); - writer.EndObject(); - } - writer.EndArray(); - writer.EndObject(); - }; - - auto serializePart = [](Writer& writer, const std::string& text) { - writer.StartObject(); - writer.String("type"); - writer.String("output_text"); - writer.String("text"); - writer.String(text.c_str()); - writer.String("annotations"); - writer.StartArray(); - writer.EndArray(); - writer.EndObject(); - }; - - auto serializeResponsesEvent = [](const std::function&)>& eventSerializer) { - StringBuffer eventBuffer; - Writer eventWriter(eventBuffer); - eventSerializer(eventWriter); - return std::string(eventBuffer.GetString()); - }; - std::vector events; if (!responsesStreamingInitialized) { - events.emplace_back(serializeResponsesEvent([this, &serializeResponseObject](Writer& writer) { + events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { writer.StartObject(); writer.String("type"); writer.String("response.created"); writer.String("sequence_number"); writer.Uint64(responsesStreamingSequenceNumber++); 
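+            // sequence_number is a single monotonically increasing counter shared by
+            // every event emitted for this response.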
writer.String("response"); - serializeResponseObject(writer, "in_progress", "", false); + serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); writer.EndObject(); })); - events.emplace_back(serializeResponsesEvent([this, &serializeResponseObject](Writer& writer) { + events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { writer.StartObject(); writer.String("type"); writer.String("response.in_progress"); writer.String("sequence_number"); writer.Uint64(responsesStreamingSequenceNumber++); writer.String("response"); - serializeResponseObject(writer, "in_progress", "", false); + serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); writer.EndObject(); })); - events.emplace_back(serializeResponsesEvent([this, &outputItemId, &serializeOutputItem](Writer& writer) { + events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { writer.StartObject(); writer.String("type"); writer.String("response.output_item.added"); @@ -1931,11 +1876,11 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str writer.String("output_index"); writer.Uint64(0); writer.String("item"); - serializeOutputItem(writer, "", "in_progress", false); + serializeResponsesOutputItem(writer, outputItemId, "", "in_progress", false); writer.EndObject(); })); - events.emplace_back(serializeResponsesEvent([this, &outputItemId, &serializePart](Writer& writer) { + events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { writer.StartObject(); writer.String("type"); writer.String("response.content_part.added"); @@ -1948,7 +1893,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str writer.String("item_id"); writer.String(outputItemId.c_str()); writer.String("part"); - serializePart(writer, ""); + serializeResponsesPart(writer, ""); writer.EndObject(); })); @@ -1999,7 +1944,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str writer.EndObject(); })); - events.emplace_back(serializeResponsesEvent([this, &outputItemId, &serializePart](Writer& writer) { + events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { writer.StartObject(); writer.String("type"); writer.String("response.content_part.done"); @@ -2012,11 +1957,11 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str writer.String("item_id"); writer.String(outputItemId.c_str()); writer.String("part"); - serializePart(writer, responsesStreamingOutputText); + serializeResponsesPart(writer, responsesStreamingOutputText); writer.EndObject(); })); - events.emplace_back(serializeResponsesEvent([this, &serializeOutputItem](Writer& writer) { + events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { writer.StartObject(); writer.String("type"); writer.String("response.output_item.done"); @@ -2025,18 +1970,18 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str writer.String("output_index"); writer.Uint64(0); writer.String("item"); - serializeOutputItem(writer, responsesStreamingOutputText, "completed", true); + serializeResponsesOutputItem(writer, outputItemId, responsesStreamingOutputText, "completed", true); writer.EndObject(); })); - events.emplace_back(serializeResponsesEvent([this, &serializeResponseObject](Writer& writer) { + events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { 
writer.StartObject(); writer.String("type"); writer.String("response.completed"); writer.String("sequence_number"); writer.Uint64(responsesStreamingSequenceNumber++); writer.String("response"); - serializeResponseObject(writer, "completed", responsesStreamingOutputText, true); + serializeResponsesResponseObject(writer, responseId, createdAt, "completed", responsesStreamingOutputText, true); writer.EndObject(); })); } diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 9b6be026cc..352ab709b5 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -87,6 +87,16 @@ class OpenAIChatCompletionsHandler { ParsedOutput parseOutputIfNeeded(const std::vector& generatedIds); absl::Status ensureArgumentsInToolCalls(Value& messageObj, bool& jsonChanged); + // Responses API serialization helpers + void serializeResponsesToolChoice(Writer& writer) const; + void serializeResponsesTools(Writer& writer) const; + void serializeResponsesResponseObject(Writer& writer, const std::string& responseId, int64_t createdAt, + const char* status, const std::string& fullOutputText, bool includeUsage) const; + static void serializeResponsesOutputItem(Writer& writer, const std::string& outputItemId, + const std::string& text, const char* status, bool withContent); + static void serializeResponsesPart(Writer& writer, const std::string& text); + std::string serializeResponsesUnaryResponse(const std::vector& parsedOutputs) const; + public: OpenAIChatCompletionsHandler(Document& doc, Endpoint endpoint, std::chrono::time_point creationTime, ov::genai::Tokenizer tokenizer, const std::string& toolParserName = "", const std::string& reasoningParserName = "") : diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 875e23ed02..e4e150d9e8 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -1715,6 +1715,38 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesMaxTokensIsIgnored) { EXPECT_FALSE(apiHandler->getMaxTokens().has_value() && apiHandler->getMaxTokens().value() == 50); } +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesNStreamingIsRejected) { + std::string json = R"({ + "model": "llama", + "input": "valid prompt", + "stream": true, + "n": 2 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("n greater than 1 is not supported for responses streaming")); +} + +TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesNUnaryIsAccepted) { + std::string json = R"({ + "model": "llama", + "input": "valid prompt", + "best_of": 3, + "n": 2 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 100; + std::optional maxModelLength; + std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); +} + TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesFlatFunctionToolsSucceeds) { std::string json = R"({ "model": "llama", From 5d3ecd37e72537fa8f7423df2457b07ec6f280e5 Mon Sep 17 00:00:00 2001 
From: Michal Kulakowski
Date: Fri, 20 Mar 2026 09:48:51 +0100
Subject: [PATCH 12/24] fixes

---
 demos/continuous_batching/README.md | 2 +-
 demos/continuous_batching/vlm/README.md | 59 ++--
 src/llm/BUILD | 1 +
 src/llm/apis/openai_completions.cpp | 396 +++++++++++++++++-------
 src/llm/apis/openai_completions.hpp | 12 +-
 src/llm/http_llm_calculator.cc | 30 ++
 src/llm/py_jinja_template_processor.cpp | 113 +++++++
 src/llm/py_jinja_template_processor.hpp | 17 +
 src/llm/servable.cpp | 12 +-
 src/test/http_openai_handler_test.cpp | 292 ++++++++++++++++-
 src/test/llm/llmtemplate_test.cpp | 58 ++++
 11 files changed, 819 insertions(+), 173 deletions(-)

diff --git a/demos/continuous_batching/README.md b/demos/continuous_batching/README.md
index fb71ad2504..550ab2f516 100644
--- a/demos/continuous_batching/README.md
+++ b/demos/continuous_batching/README.md
@@ -147,7 +147,7 @@ curl -s http://localhost:8000/v3/chat/completions -H "Content-Type: application/
 :::

-### Unary calls to responses endpoint using cURL
+### Unary calls via Responses API using cURL

 ::::{tab-set}

diff --git a/demos/continuous_batching/vlm/README.md b/demos/continuous_batching/vlm/README.md
index 2580360bd3..0436bb4dd7 100644
--- a/demos/continuous_batching/vlm/README.md
+++ b/demos/continuous_batching/vlm/README.md
@@ -119,11 +119,31 @@ curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/js
 ```
 :::

-:::{dropdown} **Unary call with curl using responses endpoint**
-**Note**: using urls in request requires `--allowed_media_domains` parameter described [here](../../../docs/parameters.md)
+:::{dropdown} **Unary call with cURL using Responses API**
+**Note**: Using URLs in requests requires the `--allowed_media_domains` parameter described [here](../../../docs/parameters.md)
 ```bash
-curl http://localhost:8000/v3/responses -H "Content-Type: application/json" -d "{ \"model\": \"OpenGVLab/InternVL2-2B\", \"input\":[{\"role\": \"user\", \"content\": [{\"type\": \"input_text\", \"text\": \"Describe what is on the picture.\"},{\"type\": \"input_image\", \"image_url\": \"http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/3/demos/common/static/images/zebra.jpeg\"}]}], \"max_output_tokens\": 100}"
+curl http://localhost:8000/v3/responses \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "OpenGVLab/InternVL2-2B",
+    "input": [
+      {
+        "role": "user",
+        "content": [
+          {
+            "type": "input_text",
+            "text": "Describe what is on the picture."
+          },
+          {
+            "type": "input_image",
+            "image_url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/3/demos/common/static/images/zebra.jpeg"
+          }
+        ]
+      }
+    ],
+    "max_output_tokens": 100
+  }'
 ```
 ```json
 {
@@ -262,7 +282,7 @@ The picture features a zebra standing in a grassy area. The zebra is characteriz
 :::

-:::{dropdown} **Streaming request with OpenAI client using responses endpoint**
+:::{dropdown} **Streaming request with OpenAI client via Responses API**

 ```console
 pip3 install openai
@@ -305,37 +325,6 @@ The picture features a zebra standing in a grassy area. The zebra is characteriz

 :::

-## Benchmarking text generation with high concurrency
-OpenVINO Model Server employs efficient parallelization for text generation. It can be used to generate text also in high concurrency in the environment shared by multiple clients.
-It can be demonstrated using benchmarking app from vLLM repository: -```console -git clone --branch v0.7.3 --depth 1 https://github.com/vllm-project/vllm -cd vllm -pip3 install -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu -cd benchmarks -python benchmark_serving.py --backend openai-chat --dataset-name hf --dataset-path lmarena-ai/vision-arena-bench-v0.1 --hf-split train --host localhost --port 8000 --model OpenGVLab/InternVL2-2B --endpoint /v3/chat/completions --max-concurrency 1 --num-prompts 100 --trust-remote-code - -Burstiness factor: 1.0 (Poisson process) -Maximum request concurrency: None -============ Serving Benchmark Result ============ -Successful requests: 100 -Benchmark duration (s): 287.81 -Total input tokens: 15381 -Total generated tokens: 20109 -Request throughput (req/s): 0.35 -Output token throughput (tok/s): 69.87 -Total Token throughput (tok/s): 123.31 ----------------Time to First Token---------------- -Mean TTFT (ms): 1513.96 -Median TTFT (ms): 1368.93 -P99 TTFT (ms): 2647.45 ------Time per Output Token (excl. 1st token)------ -Mean TPOT (ms): 6.68 -Median TPOT (ms): 6.68 -P99 TPOT (ms): 8.02 -``` - ## Testing the model accuracy over serving API Check the [guide of using lm-evaluation-harness](../accuracy/README.md) diff --git a/src/llm/BUILD b/src/llm/BUILD index ae37d936ca..bc41306ed7 100644 --- a/src/llm/BUILD +++ b/src/llm/BUILD @@ -252,6 +252,7 @@ ovms_cc_library( "//third_party:openvino", "//src:libovmslogging", "//src/python:utils", + "//src/port:rapidjson_document", ] + PYBIND_DEPS, visibility = ["//visibility:public"], additional_copts = COPTS_PYTHON diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 568bad4551..f879d39195 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -103,6 +103,22 @@ std::string serializeResponsesEvent(const std::function& writer, const char* fieldName) { + writer.String(fieldName); + writer.Null(); +} + +void serializeNotSupportedZeroField(Writer& writer, const char* fieldName) { + writer.String(fieldName); + writer.Uint64(0); +} + +void serializeNotSupportedEmptyArrayField(Writer& writer, const char* fieldName) { + writer.String(fieldName); + writer.StartArray(); + writer.EndArray(); +} + } // namespace void OpenAIChatCompletionsHandler::serializeResponsesToolChoice(Writer& writer) const { @@ -138,7 +154,8 @@ void OpenAIChatCompletionsHandler::serializeResponsesTools(Writer& } void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer& writer, const std::string& responseId, int64_t createdAt, - const char* status, const std::string& fullOutputText, bool includeUsage) const { + const char* status, const std::string& fullOutputText, bool includeUsage, + const char* incompleteReason, const char* errorMessage, const char* errorCode) const { writer.StartObject(); writer.String("id"); writer.String(responseId.c_str()); @@ -147,8 +164,27 @@ void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer(std::chrono::system_clock::now().time_since_epoch()).count(); writer.String("completed_at"); - writer.Int64(createdAt); + writer.Int64(completedAt); + } + if (incompleteReason != nullptr) { + writer.String("incomplete_details"); + writer.StartObject(); + writer.String("reason"); + writer.String(incompleteReason); + writer.EndObject(); + } + writer.String("error"); + if (errorMessage != nullptr) { + writer.StartObject(); + writer.String("code"); + writer.String(errorCode != nullptr ? 
errorCode : "server_error"); + writer.String("message"); + writer.String(errorMessage); + writer.EndObject(); + } else { + writer.Null(); } writer.String("model"); writer.String(request.model.c_str()); @@ -157,8 +193,38 @@ void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer(request.temperature.value())); + } else { + writer.Double(1.0); + } + writer.String("text"); + writer.StartObject(); + writer.String("format"); + writer.StartObject(); + writer.String("type"); + writer.String("text"); + writer.EndObject(); + writer.EndObject(); serializeResponsesToolChoice(writer); serializeResponsesTools(writer); + writer.String("top_p"); + if (request.topP.has_value()) { + writer.Double(static_cast(request.topP.value())); + } else { + writer.Double(1.0); + } + writer.String("truncation"); + writer.String("disabled"); + serializeNotSupportedNullField(writer, "user"); + writer.String("metadata"); + writer.StartObject(); + writer.EndObject(); if (request.maxTokens.has_value()) { writer.String("max_output_tokens"); @@ -176,7 +242,13 @@ void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer(usage.promptTokens)); writer.String("input_tokens_details"); writer.StartObject(); - writer.String("cached_tokens"); - writer.Uint64(0); + serializeNotSupportedZeroField(writer, "cached_tokens"); writer.EndObject(); writer.String("output_tokens"); writer.Uint64(static_cast(usage.completionTokens)); writer.String("output_tokens_details"); writer.StartObject(); - writer.String("reasoning_tokens"); - writer.Uint64(0); + serializeNotSupportedZeroField(writer, "reasoning_tokens"); writer.EndObject(); writer.String("total_tokens"); writer.Uint64(static_cast(usage.calculateTotalTokens())); @@ -242,8 +312,12 @@ void OpenAIChatCompletionsHandler::serializeResponsesPart(Writer& writer.EndObject(); } -std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const std::vector& parsedOutputs) const { +std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const std::vector& parsedOutputs, + ov::genai::GenerationFinishReason finishReason) const { + const bool isIncomplete = (finishReason == ov::genai::GenerationFinishReason::LENGTH); + const char* responseStatus = isIncomplete ? 
"incomplete" : "completed"; const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); + const auto completedAt = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); const std::string responseId = "resp-" + std::to_string(createdAt); StringBuffer buffer; @@ -256,17 +330,57 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const writer.String("response"); writer.String("created_at"); writer.Int64(createdAt); - writer.String("completed_at"); - writer.Int64(createdAt); + if (!isIncomplete) { + writer.String("completed_at"); + writer.Int64(completedAt); + } + if (isIncomplete) { + writer.String("incomplete_details"); + writer.StartObject(); + writer.String("reason"); + writer.String("max_tokens"); + writer.EndObject(); + } + serializeNotSupportedNullField(writer, "error"); writer.String("model"); writer.String(request.model.c_str()); writer.String("status"); - writer.String("completed"); + writer.String(responseStatus); writer.String("parallel_tool_calls"); writer.Bool(false); + serializeNotSupportedNullField(writer, "previous_response_id"); + serializeNotSupportedNullField(writer, "reasoning"); + writer.String("store"); + writer.Bool(true); + writer.String("temperature"); + if (request.temperature.has_value()) { + writer.Double(static_cast(request.temperature.value())); + } else { + writer.Double(1.0); + } + writer.String("text"); + writer.StartObject(); + writer.String("format"); + writer.StartObject(); + writer.String("type"); + writer.String("text"); + writer.EndObject(); + writer.EndObject(); serializeResponsesToolChoice(writer); serializeResponsesTools(writer); + writer.String("top_p"); + if (request.topP.has_value()) { + writer.Double(static_cast(request.topP.value())); + } else { + writer.Double(1.0); + } + writer.String("truncation"); + writer.String("disabled"); + serializeNotSupportedNullField(writer, "user"); + writer.String("metadata"); + writer.StartObject(); + writer.EndObject(); if (request.maxTokens.has_value()) { writer.String("max_output_tokens"); @@ -287,7 +401,7 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const writer.String("role"); writer.String("assistant"); writer.String("status"); - writer.String("completed"); + writer.String(responseStatus); writer.String("content"); writer.StartArray(); serializeResponsesPart(writer, parsedOutput.content); @@ -302,15 +416,13 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const writer.Uint64(static_cast(usage.promptTokens)); writer.String("input_tokens_details"); writer.StartObject(); - writer.String("cached_tokens"); - writer.Uint64(0); + serializeNotSupportedZeroField(writer, "cached_tokens"); writer.EndObject(); writer.String("output_tokens"); writer.Uint64(static_cast(usage.completionTokens)); writer.String("output_tokens_details"); writer.StartObject(); - writer.String("reasoning_tokens"); - writer.Uint64(0); + serializeNotSupportedZeroField(writer, "reasoning_tokens"); writer.EndObject(); writer.String("total_tokens"); writer.Uint64(static_cast(usage.calculateTotalTokens())); @@ -477,23 +589,15 @@ absl::Status OpenAIChatCompletionsHandler::parseResponsesInputDirectly(std::opti return absl::InvalidArgumentError("input missing in request"); } - auto& allocator = doc.GetAllocator(); - rapidjson::Value messages(rapidjson::kArrayType); - if (inputIt->value.IsString()) { request.prompt = inputIt->value.GetString(); - if (!request.prompt.has_value() || 
request.prompt.value().empty()) { + if (request.prompt.value().empty()) { return absl::InvalidArgumentError("input cannot be empty"); } request.chatHistory.push_back({}); request.chatHistory.last()["role"] = "user"; request.chatHistory.last()["content"] = request.prompt.value(); - - rapidjson::Value messageObj(rapidjson::kObjectType); - messageObj.AddMember("role", "user", allocator); - messageObj.AddMember("content", rapidjson::Value(request.prompt->c_str(), allocator), allocator); - messages.PushBack(messageObj, allocator); } else if (inputIt->value.IsArray()) { if (inputIt->value.GetArray().Size() == 0) { return absl::InvalidArgumentError("Messages array cannot be empty"); @@ -514,18 +618,13 @@ absl::Status OpenAIChatCompletionsHandler::parseResponsesInputDirectly(std::opti request.chatHistory.push_back({}); request.chatHistory.last()["role"] = roleIt->value.GetString(); - rapidjson::Value messageObj(rapidjson::kObjectType); - messageObj.AddMember("role", rapidjson::Value(roleIt->value.GetString(), allocator), allocator); - auto contentIt = itemObj.FindMember("content"); if (contentIt == itemObj.MemberEnd()) { return absl::InvalidArgumentError("input item content is missing"); } if (contentIt->value.IsString()) { - messageObj.AddMember("content", rapidjson::Value(contentIt->value.GetString(), allocator), allocator); request.chatHistory.last()["content"] = contentIt->value.GetString(); - messages.PushBack(messageObj, allocator); continue; } @@ -635,26 +734,17 @@ absl::Status OpenAIChatCompletionsHandler::parseResponsesInputDirectly(std::opti } request.imageHistory.push_back({i, tensor}); } else { - return absl::InvalidArgumentError("Unsupported content type"); + return absl::InvalidArgumentError("Unsupported content type. Supported types are input_text and input_image."); } } - messageObj.AddMember("content", rapidjson::Value(contentText.c_str(), allocator), allocator); request.chatHistory.last()["content"] = contentText; - messages.PushBack(messageObj, allocator); } } else { return absl::InvalidArgumentError("input is not a string or array"); } - auto existingMessages = doc.FindMember("messages"); - if (existingMessages != doc.MemberEnd()) { - existingMessages->value = messages; - } else { - doc.AddMember("messages", messages, allocator); - } - - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Parsed responses input directly to chat history"); + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Parsed responses input directly to chat history without mutating request JSON"); return absl::OkStatus(); } @@ -943,7 +1033,7 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { parametersValue->Accept(writer); std::string parametersStr = buffer.GetString(); ToolSchemaWrapper schemaReprs{parametersValue, std::move(parametersStr)}; - request.toolNameSchemaMap[functionNameCStr] = std::move(schemaReprs); + request.toolNameSchemaMap[functionName] = std::move(schemaReprs); } } } else { @@ -1009,6 +1099,22 @@ const bool OpenAIChatCompletionsHandler::areToolsAvailable() const { return !request.toolNameSchemaMap.empty(); } +const rapidjson::Value* OpenAIChatCompletionsHandler::getRawTools() const { + auto it = doc.FindMember("tools"); + if (it == doc.MemberEnd() || it->value.IsNull()) { + return nullptr; + } + return &it->value; +} + +const rapidjson::Value* OpenAIChatCompletionsHandler::getRawChatTemplateKwargs() const { + auto it = doc.FindMember("chat_template_kwargs"); + if (it == doc.MemberEnd() || it->value.IsNull()) { + return nullptr; + } + return &it->value; +} + const 
OpenAIChatCompletionsRequest& OpenAIChatCompletionsHandler::getRequest() const { return request; } @@ -1182,13 +1288,6 @@ absl::Status OpenAIChatCompletionsHandler::parseResponsesPart(std::optional writer(buffer); - doc.Accept(writer); - request.processedJson = buffer.GetString(); - } - return absl::OkStatus(); } @@ -1415,7 +1514,7 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optionalvalue.GetUint() == 0) return absl::InvalidArgumentError("n value should be greater than 0"); if (endpoint == Endpoint::RESPONSES && request.stream && it->value.GetUint() > 1) - return absl::InvalidArgumentError("n greater than 1 is not supported for responses streaming"); + return absl::InvalidArgumentError("n greater than 1 is not supported for Responses API streaming"); size_t bestOf = request.bestOf.has_value() ? request.bestOf.value() : 1; // 1 is default best_of value if (bestOf < it->value.GetUint()) { return absl::InvalidArgumentError("n value cannot be greater than best_of"); @@ -1540,11 +1639,15 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect if (endpoint == Endpoint::RESPONSES) { std::vector parsedOutputs; usage.completionTokens = 0; + ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP; for (const ov::genai::GenerationOutput& generationOutput : generationOutputs) { updateUsage(usage, generationOutput.generated_ids, request.echo); parsedOutputs.push_back(parseOutputIfNeeded(generationOutput.generated_ids)); + if (generationOutput.finish_reason == ov::genai::GenerationFinishReason::LENGTH) { + responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH; + } } - return serializeResponsesUnaryResponse(parsedOutputs); + return serializeResponsesUnaryResponse(parsedOutputs, responsesFinishReason); } OpenAiJsonResponse jsonResponse; @@ -1675,7 +1778,6 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco if (endpoint == Endpoint::RESPONSES) { std::vector parsedOutputs; for (const auto& tokens : results.tokens) { - updateUsage(usage, tokens, request.echo); parsedOutputs.push_back(parseOutputIfNeeded(tokens)); } return serializeResponsesUnaryResponse(parsedOutputs); @@ -1836,6 +1938,75 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD return jsonResponse.ToString(); } +std::string OpenAIChatCompletionsHandler::serializeResponsesStreamingInitEvents() { + const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); + const std::string responseId = "resp-" + std::to_string(createdAt); + const std::string outputItemId = "msg-0"; + + std::vector events; + + events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.created"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); + writer.EndObject(); + })); + + events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.in_progress"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); + writer.EndObject(); + })); + + 
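+    // The init sequence follows the OpenAI Responses streaming contract:
+    // response.created -> response.in_progress -> response.output_item.added ->
+    // response.content_part.added. Output text deltas and the *.done / terminal
+    // events (response.completed, response.incomplete or response.failed) are
+    // emitted later by serializeStreamingChunk() and serializeResponsesFailedEvent().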
events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.output_item.added"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("item"); + serializeResponsesOutputItem(writer, outputItemId, "", "in_progress", false); + writer.EndObject(); + })); + + events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.content_part.added"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("output_index"); + writer.Uint64(0); + writer.String("content_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(outputItemId.c_str()); + writer.String("part"); + serializeResponsesPart(writer, ""); + writer.EndObject(); + })); + + responsesStreamingInitialized = true; + + std::stringstream ss; + ss << events.front(); + for (size_t i = 1; i < events.size(); ++i) { + ss << "\n\ndata: " << events[i]; + } + return ss.str(); +} + std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason) { OVMS_PROFILE_FUNCTION(); if (endpoint == Endpoint::RESPONSES) { @@ -1845,59 +2016,11 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str std::vector events; if (!responsesStreamingInitialized) { - events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { - writer.StartObject(); - writer.String("type"); - writer.String("response.created"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("response"); - serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); - writer.EndObject(); - })); - - events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { - writer.StartObject(); - writer.String("type"); - writer.String("response.in_progress"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("response"); - serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); - writer.EndObject(); - })); - - events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { - writer.StartObject(); - writer.String("type"); - writer.String("response.output_item.added"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("output_index"); - writer.Uint64(0); - writer.String("item"); - serializeResponsesOutputItem(writer, outputItemId, "", "in_progress", false); - writer.EndObject(); - })); - - events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { - writer.StartObject(); - writer.String("type"); - writer.String("response.content_part.added"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("output_index"); - writer.Uint64(0); - writer.String("content_index"); - writer.Uint64(0); - writer.String("item_id"); - writer.String(outputItemId.c_str()); - writer.String("part"); - serializeResponsesPart(writer, ""); - writer.EndObject(); - })); - - responsesStreamingInitialized = true; + // Fallback: if init events were not sent earlier, emit them now + std::string 
initEvents = serializeResponsesStreamingInitEvents(); + if (!initEvents.empty()) { + events.emplace_back(std::move(initEvents)); + } } if (!chunkResponse.empty()) { @@ -1916,9 +2039,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str writer.String(outputItemId.c_str()); writer.String("delta"); writer.String(chunkResponse.c_str()); - writer.String("logprobs"); - writer.StartArray(); - writer.EndArray(); + serializeNotSupportedEmptyArrayField(writer, "logprobs"); writer.EndObject(); })); } @@ -1938,9 +2059,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str writer.String(outputItemId.c_str()); writer.String("text"); writer.String(responsesStreamingOutputText.c_str()); - writer.String("logprobs"); - writer.StartArray(); - writer.EndArray(); + serializeNotSupportedEmptyArrayField(writer, "logprobs"); writer.EndObject(); })); @@ -1961,7 +2080,8 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str writer.EndObject(); })); - events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { + events.emplace_back(serializeResponsesEvent([this, &outputItemId, finishReason](Writer& writer) { + const char* itemStatus = (finishReason == ov::genai::GenerationFinishReason::LENGTH) ? "incomplete" : "completed"; writer.StartObject(); writer.String("type"); writer.String("response.output_item.done"); @@ -1970,18 +2090,22 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str writer.String("output_index"); writer.Uint64(0); writer.String("item"); - serializeResponsesOutputItem(writer, outputItemId, responsesStreamingOutputText, "completed", true); + serializeResponsesOutputItem(writer, outputItemId, responsesStreamingOutputText, itemStatus, true); writer.EndObject(); })); - events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { + events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt, finishReason](Writer& writer) { + const bool isIncomplete = (finishReason == ov::genai::GenerationFinishReason::LENGTH); + const char* responseStatus = isIncomplete ? "incomplete" : "completed"; + const char* eventType = isIncomplete ? "response.incomplete" : "response.completed"; + const char* incompleteReason = isIncomplete ? 
"max_tokens" : nullptr; writer.StartObject(); writer.String("type"); - writer.String("response.completed"); + writer.String(eventType); writer.String("sequence_number"); writer.Uint64(responsesStreamingSequenceNumber++); writer.String("response"); - serializeResponsesResponseObject(writer, responseId, createdAt, "completed", responsesStreamingOutputText, true); + serializeResponsesResponseObject(writer, responseId, createdAt, responseStatus, responsesStreamingOutputText, true, incompleteReason); writer.EndObject(); })); } @@ -2090,6 +2214,38 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str return buffer.GetString(); } +std::string OpenAIChatCompletionsHandler::serializeResponsesFailedEvent(const std::string& errorMessage, const char* errorCode) { + const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); + const std::string responseId = "resp-" + std::to_string(createdAt); + + std::vector events; + if (!responsesStreamingInitialized) { + std::string initEvents = serializeResponsesStreamingInitEvents(); + if (!initEvents.empty()) { + events.emplace_back(std::move(initEvents)); + } + } + + events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt, &errorMessage, errorCode](Writer& writer) { + writer.StartObject(); + writer.String("type"); + writer.String("response.failed"); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, "failed", responsesStreamingOutputText, false, + nullptr, errorMessage.c_str(), errorCode); + writer.EndObject(); + })); + + std::stringstream ss; + ss << events.front(); + for (size_t i = 1; i < events.size(); ++i) { + ss << "\n\ndata: " << events[i]; + } + return ss.str(); +} + std::string OpenAIChatCompletionsHandler::serializeStreamingUsageChunk() { OVMS_PROFILE_FUNCTION(); if (endpoint == Endpoint::RESPONSES) { @@ -2106,7 +2262,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingUsageChunk() { // created: integer; Unix timestamp (in seconds) when the MP graph was created. 
writer.String("created"); - writer.Int(std::chrono::duration_cast(created.time_since_epoch()).count()); + writer.Int64(std::chrono::duration_cast(created.time_since_epoch()).count()); // model: string; copied from the request writer.String("model"); @@ -2124,11 +2280,11 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingUsageChunk() { writer.String("usage"); writer.StartObject(); // { writer.String("prompt_tokens"); - writer.Int(usage.promptTokens); + writer.Uint64(static_cast(usage.promptTokens)); writer.String("completion_tokens"); - writer.Int(usage.completionTokens); + writer.Uint64(static_cast(usage.completionTokens)); writer.String("total_tokens"); - writer.Int(usage.calculateTotalTokens()); + writer.Uint64(static_cast(usage.calculateTotalTokens())); writer.EndObject(); // } writer.EndObject(); // } diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 352ab709b5..96ac3c50a9 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -71,7 +71,7 @@ class OpenAIChatCompletionsHandler { ov::genai::Tokenizer tokenizer; size_t processedTokens = 0; // tracks overall number of tokens processed by the pipeline bool toolCallsDetectedInStream = false; // tracks whether tool calls were detected in any streaming chunk - size_t responsesStreamingSequenceNumber = 0; + size_t responsesStreamingSequenceNumber = 1; bool responsesStreamingInitialized = false; std::string responsesStreamingOutputText; @@ -91,11 +91,13 @@ class OpenAIChatCompletionsHandler { void serializeResponsesToolChoice(Writer& writer) const; void serializeResponsesTools(Writer& writer) const; void serializeResponsesResponseObject(Writer& writer, const std::string& responseId, int64_t createdAt, - const char* status, const std::string& fullOutputText, bool includeUsage) const; + const char* status, const std::string& fullOutputText, bool includeUsage, + const char* incompleteReason = nullptr, const char* errorMessage = nullptr, const char* errorCode = nullptr) const; static void serializeResponsesOutputItem(Writer& writer, const std::string& outputItemId, const std::string& text, const char* status, bool withContent); static void serializeResponsesPart(Writer& writer, const std::string& text); - std::string serializeResponsesUnaryResponse(const std::vector& parsedOutputs) const; + std::string serializeResponsesUnaryResponse(const std::vector& parsedOutputs, + ov::genai::GenerationFinishReason finishReason = ov::genai::GenerationFinishReason::STOP) const; public: OpenAIChatCompletionsHandler(Document& doc, Endpoint endpoint, std::chrono::time_point creationTime, @@ -139,6 +141,8 @@ class OpenAIChatCompletionsHandler { absl::StatusOr> parseToolsToJsonContainer(); absl::StatusOr> parseChatTemplateKwargsToJsonContainer(); const bool areToolsAvailable() const; + const rapidjson::Value* getRawTools() const; + const rapidjson::Value* getRawChatTemplateKwargs() const; std::string serializeUnaryResponse(const std::vector& generationOutputs); std::string serializeUnaryResponse(ov::genai::EncodedResults& results); @@ -146,5 +150,7 @@ class OpenAIChatCompletionsHandler { std::string serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason); std::string serializeStreamingUsageChunk(); std::string serializeStreamingHandshakeChunk(); + std::string serializeResponsesStreamingInitEvents(); + std::string serializeResponsesFailedEvent(const std::string& errorMessage, const char* errorCode = "server_error"); }; } // 
namespace ovms diff --git a/src/llm/http_llm_calculator.cc b/src/llm/http_llm_calculator.cc index ae6461c61a..ff914864f9 100644 --- a/src/llm/http_llm_calculator.cc +++ b/src/llm/http_llm_calculator.cc @@ -125,6 +125,22 @@ class HttpLLMCalculator : public CalculatorBase { if (status != absl::OkStatus()) return status; SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "LLMCalculator [Node: {}] Pipeline execution scheduled successfully", cc->NodeName()); + + // For RESPONSES streaming, emit init events (response.created, response.in_progress, etc.) + // immediately after scheduling, before blocking on readPartialExecutionResults. + // This reduces perceived latency - the client sees the response is created right away. + if (executionContext->apiHandler->isStream() && executionContext->endpoint == Endpoint::RESPONSES) { + std::string initEvents = executionContext->apiHandler->serializeResponsesStreamingInitEvents(); + if (!initEvents.empty()) { + executionContext->response = wrapTextInServerSideEventMessage(initEvents); + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(new std::string{std::move(executionContext->response)}, iterationBeginTimestamp); + executionContext->response = ""; + } + cc->Outputs().Tag(LOOPBACK_TAG_NAME).Add(new bool{true}, iterationBeginTimestamp); + auto now = std::chrono::system_clock::now(); + iterationBeginTimestamp = ::mediapipe::Timestamp(std::chrono::duration_cast(now.time_since_epoch()).count()); + return absl::OkStatus(); + } } if (!executionContext->apiHandler->isStream()) { // Unary scenario @@ -160,8 +176,22 @@ class HttpLLMCalculator : public CalculatorBase { cc->Outputs().Tag(LOOPBACK_TAG_NAME).Add(new bool{true}, iterationBeginTimestamp); } } catch (ov::AssertFailure& e) { + if (executionContext->apiHandler && executionContext->apiHandler->isStream() && executionContext->endpoint == Endpoint::RESPONSES) { + std::string failedEvent = executionContext->apiHandler->serializeResponsesFailedEvent(e.what()); + executionContext->response = wrapTextInServerSideEventMessage(failedEvent); + executionContext->response += wrapTextInServerSideEventMessage("[DONE]"); + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(new std::string{std::move(executionContext->response)}, iterationBeginTimestamp); + return absl::OkStatus(); + } return absl::InvalidArgumentError(e.what()); } catch (...) 
{ + if (executionContext->apiHandler && executionContext->apiHandler->isStream() && executionContext->endpoint == Endpoint::RESPONSES) { + std::string failedEvent = executionContext->apiHandler->serializeResponsesFailedEvent("Response generation failed"); + executionContext->response = wrapTextInServerSideEventMessage(failedEvent); + executionContext->response += wrapTextInServerSideEventMessage("[DONE]"); + cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(new std::string{std::move(executionContext->response)}, iterationBeginTimestamp); + return absl::OkStatus(); + } return absl::InvalidArgumentError("Response generation failed"); } auto now = std::chrono::system_clock::now(); diff --git a/src/llm/py_jinja_template_processor.cpp b/src/llm/py_jinja_template_processor.cpp index 432aa8e722..5a2fd6de49 100644 --- a/src/llm/py_jinja_template_processor.cpp +++ b/src/llm/py_jinja_template_processor.cpp @@ -18,6 +18,8 @@ #include #include +#include "src/port/rapidjson_document.hpp" + #pragma warning(push) #pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 6246 4456) #pragma GCC diagnostic push @@ -89,4 +91,115 @@ bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templ return false; } +static py::object rapidJsonValueToPyObject(const rapidjson::Value& value) { + if (value.IsNull()) return py::none(); + if (value.IsBool()) return py::bool_(value.GetBool()); + if (value.IsInt()) return py::int_(value.GetInt()); + if (value.IsUint()) return py::int_(value.GetUint()); + if (value.IsInt64()) return py::int_(value.GetInt64()); + if (value.IsUint64()) return py::int_(value.GetUint64()); + if (value.IsDouble()) return py::float_(value.GetDouble()); + if (value.IsString()) return py::str(value.GetString()); + if (value.IsArray()) { + py::list list; + for (const auto& item : value.GetArray()) { + list.append(rapidJsonValueToPyObject(item)); + } + return list; + } + if (value.IsObject()) { + py::dict dict; + for (auto it = value.MemberBegin(); it != value.MemberEnd(); ++it) { + dict[py::str(it->name.GetString())] = rapidJsonValueToPyObject(it->value); + } + return dict; + } + return py::none(); +} + +bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, + ov::genai::ChatHistory& messages, + const rapidjson::Value* tools, + const rapidjson::Value* chatTemplateKwargs, + std::string& output) { + if (templateProcessor.chatTemplate == nullptr) { + output = "Error: Chat template not loaded correctly, so it cannot be applied"; + return false; + } + + py::gil_scoped_acquire acquire; + try { + // Convert ChatHistory to Python list[dict] by extracting known fields + py::list pyMessages; + for (size_t i = 0; i < messages.size(); ++i) { + py::dict pyMsg; + auto role = messages[i]["role"].as_string(); + if (role.has_value()) { + pyMsg[py::str("role")] = py::str(role.value()); + } + auto content = messages[i]["content"].as_string(); + if (content.has_value()) { + pyMsg[py::str("content")] = py::str(content.value()); + } + pyMessages.append(pyMsg); + } + + py::object pyTools = py::none(); + if (tools != nullptr && !tools->IsNull()) { + pyTools = rapidJsonValueToPyObject(*tools); + } + + py::dict pyKwargs; + if (chatTemplateKwargs != nullptr && chatTemplateKwargs->IsObject()) { + for (auto it = chatTemplateKwargs->MemberBegin(); it != chatTemplateKwargs->MemberEnd(); ++it) { + pyKwargs[py::str(it->name.GetString())] = rapidJsonValueToPyObject(it->value); + } + } + + auto locals = py::dict( + "messages"_a = pyMessages, + "chat_template"_a = 
templateProcessor.chatTemplate->getObject(), + "tool_chat_template"_a = templateProcessor.toolTemplate->getObject(), + "bos_token"_a = templateProcessor.bosToken, + "eos_token"_a = templateProcessor.eosToken, + "tools"_a = pyTools, + "chat_template_kwargs"_a = pyKwargs); + py::exec(R"( + output = "" + error = "" + try: + if chat_template_kwargs is None: + chat_template_kwargs = {} + elif not isinstance(chat_template_kwargs, dict): + raise Exception("chat_template_kwargs must be an object") + + if tools is None: + output = chat_template.render(messages=messages, bos_token=bos_token, eos_token=eos_token, add_generation_prompt=True, **chat_template_kwargs) + else: + output = tool_chat_template.render(messages=messages, tools=tools, bos_token=bos_token, eos_token=eos_token, add_generation_prompt=True, **chat_template_kwargs) + except Exception as e: + error = str(e) + )", + py::globals(), locals); + + std::string result = locals["output"].cast(); + std::string error = locals["error"].cast(); + + if (!error.empty()) { + output = std::move(error); + return false; + } + + output = std::move(result); + return true; + } catch (const pybind11::error_already_set& e) { + LOG(INFO) << "Error occurred when applying chat template: " << e.what(); + output = "Unexpected error occurred when applying chat template"; + } catch (...) { + LOG(INFO) << "Unexpected error occurred when applying chat template"; + output = "Unexpected error occurred when applying chat template"; + } + return false; +} + } // namespace ovms diff --git a/src/llm/py_jinja_template_processor.hpp b/src/llm/py_jinja_template_processor.hpp index 219dd5250c..944cf72f5b 100644 --- a/src/llm/py_jinja_template_processor.hpp +++ b/src/llm/py_jinja_template_processor.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #pragma warning(push) #pragma warning(disable : 6326 28182 6011 28020) @@ -28,6 +29,17 @@ #include "src/python/utils.hpp" +namespace rapidjson { +class CrtAllocator; +template +class MemoryPoolAllocator; +template +class GenericValue; +template +struct UTF8; +using Value = GenericValue, MemoryPoolAllocator>; +} // namespace rapidjson + namespace ovms { class PyJinjaTemplateProcessor { @@ -38,5 +50,10 @@ class PyJinjaTemplateProcessor { std::unique_ptr> toolTemplate = nullptr; static bool applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, std::string modelsPath, const std::string& requestBody, std::string& output); + static bool applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, + ov::genai::ChatHistory& messages, + const rapidjson::Value* tools, + const rapidjson::Value* chatTemplateKwargs, + std::string& output); }; } // namespace ovms diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 3b36aecb94..5dfb2ffbf3 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -209,12 +209,12 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrapiHandler->getChatHistory().size() > 0) { #if (PYTHON_DISABLE == 0) - bool success; - if (executionContext->apiHandler->getProcessedJson().size() > 0) { - success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); - } else { - success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->payload.body, inputText); - } + bool success = PyJinjaTemplateProcessor::applyChatTemplate( + getProperties()->templateProcessor, + 
executionContext->apiHandler->getChatHistory(), + executionContext->apiHandler->getRawTools(), + executionContext->apiHandler->getRawChatTemplateKwargs(), + inputText); if (!success) { return absl::Status(absl::StatusCode::kInvalidArgument, inputText); } diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index e4e150d9e8..73afd381d0 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -521,11 +521,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTextInputCreatesUser ASSERT_TRUE(chatHistory[0].contains("content")); EXPECT_EQ(chatHistory[0]["role"], "user"); EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); - if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_NE(apiHandler->getProcessedJson().find("\"messages\""), std::string::npos); - } else { - EXPECT_TRUE(apiHandler->getProcessedJson().empty()); - } + EXPECT_TRUE(apiHandler->getProcessedJson().empty()); } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTokenLimitSetsMaxTokens) { @@ -836,8 +832,8 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesContains std::optional maxModelLength; ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); - // Phase 1: Init events emitted even with empty text (before tokenizer produces output) - std::string initChunk = apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::NONE); + // Phase 1: Init events emitted via dedicated method (called right after scheduleExecution in calculator) + std::string initChunk = apiHandler->serializeResponsesStreamingInitEvents(); ASSERT_NE(initChunk.find("\"type\":\"response.created\""), std::string::npos) << initChunk; ASSERT_NE(initChunk.find("\"type\":\"response.in_progress\""), std::string::npos) << initChunk; ASSERT_NE(initChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << initChunk; @@ -903,6 +899,286 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingUsageChunkForResponsesIsE ASSERT_EQ(apiHandler->serializeStreamingUsageChunk(), ""); } +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesEmitsIncompleteOnLengthFinish) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // Init events + apiHandler->serializeResponsesStreamingInitEvents(); + // Delta + apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE); + + // Final chunk with LENGTH finish reason + std::string finalChunk = apiHandler->serializeStreamingChunk("", ov::genai::GenerationFinishReason::LENGTH); + + // Should emit response.incomplete instead of response.completed + ASSERT_NE(finalChunk.find("\"type\":\"response.incomplete\""), std::string::npos) << finalChunk; + ASSERT_EQ(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << "Should not contain response.completed: " << finalChunk; + + // Should contain incomplete_details with max_tokens reason + ASSERT_NE(finalChunk.find("\"incomplete_details\""), std::string::npos) << finalChunk; + 
ASSERT_NE(finalChunk.find("\"reason\":\"max_tokens\""), std::string::npos) << finalChunk; + + // Response status should be "incomplete" + ASSERT_NE(finalChunk.find("\"status\":\"incomplete\""), std::string::npos) << finalChunk; + + // Should NOT contain completed_at + // Find the response.incomplete event section and check it doesn't have completed_at + auto incompletePos = finalChunk.find("\"type\":\"response.incomplete\""); + auto responseSection = finalChunk.substr(incompletePos); + ASSERT_EQ(responseSection.find("\"completed_at\""), std::string::npos) << "Incomplete response should not have completed_at: " << responseSection; + + // output_item.done should have status "incomplete" + auto itemDonePos = finalChunk.find("\"type\":\"response.output_item.done\""); + ASSERT_NE(itemDonePos, std::string::npos) << finalChunk; + auto itemSection = finalChunk.substr(itemDonePos); + ASSERT_NE(itemSection.find("\"status\":\"incomplete\""), std::string::npos) << "output_item.done should have incomplete status: " << itemSection; + + // Still should have the other finalization events + ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.content_part.done\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"type\":\"response.output_item.done\""), std::string::npos) << finalChunk; +} + +TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesEmitsCompletedOnStopFinish) { + std::string json = R"({ + "model": "llama", + "input": "What is OpenVINO?", + "stream": true + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // Init events + apiHandler->serializeResponsesStreamingInitEvents(); + // Delta + finish with STOP + std::string finalChunk = apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::STOP); + + // Should emit response.completed, NOT response.incomplete + ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk; + ASSERT_EQ(finalChunk.find("\"type\":\"response.incomplete\""), std::string::npos) << "Should not contain response.incomplete: " << finalChunk; + ASSERT_EQ(finalChunk.find("\"incomplete_details\""), std::string::npos) << "Should not contain incomplete_details: " << finalChunk; + + // Response status should be "completed" + ASSERT_NE(finalChunk.find("\"status\":\"completed\""), std::string::npos) << finalChunk; + + // Should contain new spec-aligned fields + ASSERT_NE(finalChunk.find("\"error\":null"), std::string::npos) << "Should contain error:null: " << finalChunk; + ASSERT_NE(finalChunk.find("\"previous_response_id\":null"), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"reasoning\":null"), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"store\":true"), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"truncation\":\"disabled\""), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"user\":null"), std::string::npos) << finalChunk; + ASSERT_NE(finalChunk.find("\"metadata\":{}"), std::string::npos) << finalChunk; +} + +TEST_F(HttpOpenAIHandlerParsingTest, 
serializeResponsesFailedEventContainsCorrectStructure) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "stream": true
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    std::string failedEvent = apiHandler->serializeResponsesFailedEvent("Something went wrong");
+
+    // Should contain response.failed event type
+    ASSERT_NE(failedEvent.find("\"type\":\"response.failed\""), std::string::npos) << failedEvent;
+    // Should NOT contain response.completed or response.incomplete
+    ASSERT_EQ(failedEvent.find("\"type\":\"response.completed\""), std::string::npos) << failedEvent;
+    ASSERT_EQ(failedEvent.find("\"type\":\"response.incomplete\""), std::string::npos) << failedEvent;
+
+    // Should contain error object with code and message
+    ASSERT_NE(failedEvent.find("\"error\":{"), std::string::npos) << "Should contain error object: " << failedEvent;
+    ASSERT_NE(failedEvent.find("\"code\":\"server_error\""), std::string::npos) << failedEvent;
+    ASSERT_NE(failedEvent.find("\"message\":\"Something went wrong\""), std::string::npos) << failedEvent;
+
+    // Response status should be "failed"
+    ASSERT_NE(failedEvent.find("\"status\":\"failed\""), std::string::npos) << failedEvent;
+
+    // Should include init events since they were not emitted before
+    ASSERT_NE(failedEvent.find("\"type\":\"response.created\""), std::string::npos) << failedEvent;
+
+    // Should contain sequence_number
+    ASSERT_NE(failedEvent.find("\"sequence_number\""), std::string::npos) << failedEvent;
+
+    // Should NOT contain completed_at
+    auto failedPos = failedEvent.find("\"type\":\"response.failed\"");
+    auto responseSection = failedEvent.substr(failedPos);
+    ASSERT_EQ(responseSection.find("\"completed_at\""), std::string::npos) << "Failed response should not have completed_at: " << responseSection;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, serializeResponsesFailedEventWithCustomErrorCode) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "stream": true
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    std::string failedEvent = apiHandler->serializeResponsesFailedEvent("Invalid prompt content", "invalid_prompt");
+
+    ASSERT_NE(failedEvent.find("\"code\":\"invalid_prompt\""), std::string::npos) << failedEvent;
+    ASSERT_NE(failedEvent.find("\"message\":\"Invalid prompt content\""), std::string::npos) << failedEvent;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, serializeResponsesFailedEventAfterPartialStreaming) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "stream": true
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    // Emit init events and some deltas first
+    apiHandler->serializeResponsesStreamingInitEvents();
+    apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE);
+
+    // Then fail
+    std::string failedEvent = apiHandler->serializeResponsesFailedEvent("Generation aborted");
+
+    // Should contain response.failed but NOT init events (already sent)
+    ASSERT_NE(failedEvent.find("\"type\":\"response.failed\""), std::string::npos) << failedEvent;
+    ASSERT_EQ(failedEvent.find("\"type\":\"response.created\""), std::string::npos) << "Should not re-emit init events: " << failedEvent;
+
+    // Error should be present
+    ASSERT_NE(failedEvent.find("\"error\":{"), std::string::npos) << failedEvent;
+    ASSERT_NE(failedEvent.find("\"code\":\"server_error\""), std::string::npos) << failedEvent;
+    ASSERT_NE(failedEvent.find("\"message\":\"Generation aborted\""), std::string::npos) << failedEvent;
+
+    // Should NOT contain usage (failed responses don't include usage)
+    ASSERT_EQ(failedEvent.find("\"usage\""), std::string::npos) << "Failed response should not include usage: " << failedEvent;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesIncompleteOnLength) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "max_output_tokens": 5
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::GenerationOutput genOutput;
+    ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
+    ASSERT_EQ(outputIds.get_shape().size(), 2);
+    ASSERT_EQ(outputIds.get_shape()[0], 1);
+    ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
+    int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
+    genOutput.generated_ids = std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1]);
+    genOutput.finish_reason = ov::genai::GenerationFinishReason::LENGTH;
+
+    std::vector<ov::genai::GenerationOutput> generationOutputs = {genOutput};
+    std::string serialized = apiHandler->serializeUnaryResponse(generationOutputs);
+
+    // Should have status "incomplete"
+    ASSERT_NE(serialized.find("\"status\":\"incomplete\""), std::string::npos) << serialized;
+    // Should have incomplete_details with reason
+    ASSERT_NE(serialized.find("\"incomplete_details\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"reason\":\"max_tokens\""), std::string::npos) << serialized;
+    // Should NOT have completed_at
+    ASSERT_EQ(serialized.find("\"completed_at\""), std::string::npos) << serialized;
+    // Should NOT have status "completed"
+    ASSERT_EQ(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized;
+
+    // Should contain new spec-aligned fields
+    ASSERT_NE(serialized.find("\"error\":null"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"previous_response_id\":null"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"reasoning\":null"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"store\":true"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"truncation\":\"disabled\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"user\":null"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"metadata\":{}"), std::string::npos) << serialized;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesCompletedOnStop) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "max_output_tokens": 5
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::GenerationOutput genOutput;
+    ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
+    ASSERT_EQ(outputIds.get_shape().size(), 2);
+    ASSERT_EQ(outputIds.get_shape()[0], 1);
+    ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
+    int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
+    genOutput.generated_ids = std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1]);
+    genOutput.finish_reason = ov::genai::GenerationFinishReason::STOP;
+
+    std::vector<ov::genai::GenerationOutput> generationOutputs = {genOutput};
+    std::string serialized = apiHandler->serializeUnaryResponse(generationOutputs);
+
+    // Should have status "completed"
+    ASSERT_NE(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized;
+    // Should have completed_at
+    ASSERT_NE(serialized.find("\"completed_at\""), std::string::npos) << serialized;
+    // Should NOT have incomplete_details
+    ASSERT_EQ(serialized.find("\"incomplete_details\""), std::string::npos) << serialized;
+
+    // Should contain new spec-aligned fields
+    ASSERT_NE(serialized.find("\"error\":null"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"previous_response_id\":null"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"reasoning\":null"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"store\":true"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"truncation\":\"disabled\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"user\":null"), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"metadata\":{}"), std::string::npos) << serialized;
+}
+
 TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) {
     std::string json = R"({
         "model": "llama",
@@ -1728,7 +2004,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesNStreamingIsRejected) {
     uint32_t bestOfLimit = 0;
     std::optional<uint32_t> maxModelLength;
     std::shared_ptr<ovms::OpenAIChatCompletionsHandler> apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
-    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("n greater than 1 is not supported for responses streaming"));
+    EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::InvalidArgumentError("n greater than 1 is not supported for Responses API streaming"));
 }
 
 TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesNUnaryIsAccepted) {
diff --git a/src/test/llm/llmtemplate_test.cpp b/src/test/llm/llmtemplate_test.cpp
index 8cdf565cba..10475e204f 100644
--- a/src/test/llm/llmtemplate_test.cpp
+++ b/src/test/llm/llmtemplate_test.cpp
@@ -232,6 +232,64 @@ TEST_F(LLMChatTemplateTest, ChatTemplateJinjaException) {
     ASSERT_EQ(finalPrompt, errorOutput);
 }
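+// Reviewer note: the expected strings in the tests below assume the default test
+// chat template renders each message as "User: " followed by its content, with no
+// separator between messages. A minimal Jinja template with that behavior, kept
+// here as an illustrative C++ raw-string sketch (hypothetical constant; not the
+// template file the fixture actually copies):
+//
+//     constexpr char kAssumedDefaultTestTemplate[] =
+//         R"({% for message in messages %}User: {{ message['content'] }}{% endfor %})";
+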
+TEST_F(LLMChatTemplateTest, ChatTemplateFromChatHistorySingleMessage) { + CopyDefaultChatTemplate(); + LoadTemplateProcessor(); + std::string finalPrompt = ""; + ov::genai::ChatHistory chatHistory; + chatHistory.push_back({{"role", "user"}, {"content", "How can I help you?"}}); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate( + servable->getProperties()->templateProcessor, + chatHistory, nullptr, nullptr, finalPrompt), + true); + std::string expectedOutput = "User: How can I help you?"; + ASSERT_EQ(finalPrompt, expectedOutput); +} + +TEST_F(LLMChatTemplateTest, ChatTemplateFromChatHistoryMultiMessage) { + CopyDefaultChatTemplate(); + LoadTemplateProcessor(); + std::string finalPrompt = ""; + ov::genai::ChatHistory chatHistory; + chatHistory.push_back({{"role", "user"}, {"content", "How can I help you?"}}); + chatHistory.push_back({{"role", "user"}, {"content", "2How can I help you?"}}); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate( + servable->getProperties()->templateProcessor, + chatHistory, nullptr, nullptr, finalPrompt), + true); + std::string expectedOutput = "User: How can I help you?User: 2How can I help you?"; + ASSERT_EQ(finalPrompt, expectedOutput); +} + +TEST_F(LLMChatTemplateTest, ChatTemplateFromChatHistoryMatchesJsonOverload) { + CopyDefaultChatTemplate(); + LoadTemplateProcessor(); + + // Apply template via JSON overload + std::string jsonPrompt = ""; + std::string payloadBody = R"( + { + "messages": [{ "role": "user", "content": "hello" }] + } + )"; + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate( + servable->getProperties()->templateProcessor, + servable->getProperties()->modelsPath, payloadBody, jsonPrompt), + true); + + // Apply template via ChatHistory overload + std::string chatHistoryPrompt = ""; + ov::genai::ChatHistory chatHistory; + chatHistory.push_back({{"role", "user"}, {"content", "hello"}}); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate( + servable->getProperties()->templateProcessor, + chatHistory, nullptr, nullptr, chatHistoryPrompt), + true); + + // Both overloads must produce the same result + ASSERT_EQ(jsonPrompt, chatHistoryPrompt); +} + TEST_F(LLMChatTemplateTest, ChatTemplateComparePythonAndGenAiProcessors) { GTEST_SKIP() << "Skipping test due to GenAI template processor not being able to compare values of different types (no implicit conversion). 
Enable when resolved."; // Using modified Llama2 template to work with limited tokenizer object (with no models loaded) From 8b0152104aa06349c4c3bf4cca97d51c9ed2473b Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 20 Mar 2026 10:09:36 +0100 Subject: [PATCH 13/24] style --- src/llm/apis/openai_completions.cpp | 4 ++-- src/llm/py_jinja_template_processor.cpp | 24 ++++++++++++++++-------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index f879d39195..77d7c3ee88 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -1974,7 +1974,7 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesStreamingInitEvents( writer.String("sequence_number"); writer.Uint64(responsesStreamingSequenceNumber++); writer.String("output_index"); - writer.Uint64(0); + writer.Uint64(0); writer.String("item"); serializeResponsesOutputItem(writer, outputItemId, "", "in_progress", false); writer.EndObject(); @@ -1987,7 +1987,7 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesStreamingInitEvents( writer.String("sequence_number"); writer.Uint64(responsesStreamingSequenceNumber++); writer.String("output_index"); - writer.Uint64(0); + writer.Uint64(0); writer.String("content_index"); writer.Uint64(0); writer.String("item_id"); diff --git a/src/llm/py_jinja_template_processor.cpp b/src/llm/py_jinja_template_processor.cpp index 5a2fd6de49..45d317e290 100644 --- a/src/llm/py_jinja_template_processor.cpp +++ b/src/llm/py_jinja_template_processor.cpp @@ -92,14 +92,22 @@ bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templ } static py::object rapidJsonValueToPyObject(const rapidjson::Value& value) { - if (value.IsNull()) return py::none(); - if (value.IsBool()) return py::bool_(value.GetBool()); - if (value.IsInt()) return py::int_(value.GetInt()); - if (value.IsUint()) return py::int_(value.GetUint()); - if (value.IsInt64()) return py::int_(value.GetInt64()); - if (value.IsUint64()) return py::int_(value.GetUint64()); - if (value.IsDouble()) return py::float_(value.GetDouble()); - if (value.IsString()) return py::str(value.GetString()); + if (value.IsNull()) + return py::none(); + if (value.IsBool()) + return py::bool_(value.GetBool()); + if (value.IsInt()) + return py::int_(value.GetInt()); + if (value.IsUint()) + return py::int_(value.GetUint()); + if (value.IsInt64()) + return py::int_(value.GetInt64()); + if (value.IsUint64()) + return py::int_(value.GetUint64()); + if (value.IsDouble()) + return py::float_(value.GetDouble()); + if (value.IsString()) + return py::str(value.GetString()); if (value.IsArray()) { py::list list; for (const auto& item : value.GetArray()) { From d8d715781be53bd98a7545afc8ff9995ddc60397 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 20 Mar 2026 15:32:35 +0100 Subject: [PATCH 14/24] fix --- src/llm/BUILD | 1 - src/llm/apis/openai_completions.cpp | 65 +++++++++---- src/llm/apis/openai_completions.hpp | 3 - src/llm/py_jinja_template_processor.cpp | 121 ------------------------ src/llm/py_jinja_template_processor.hpp | 18 ---- src/llm/servable.cpp | 7 +- 6 files changed, 48 insertions(+), 167 deletions(-) diff --git a/src/llm/BUILD b/src/llm/BUILD index bc41306ed7..ae37d936ca 100644 --- a/src/llm/BUILD +++ b/src/llm/BUILD @@ -252,7 +252,6 @@ ovms_cc_library( "//third_party:openvino", "//src:libovmslogging", "//src/python:utils", - "//src/port:rapidjson_document", ] + PYBIND_DEPS, visibility = 
["//visibility:public"], additional_copts = COPTS_PYTHON diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 77d7c3ee88..dae26bda66 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -583,7 +583,7 @@ absl::Status OpenAIChatCompletionsHandler::ensureArgumentsInToolCalls(Value& mes return absl::OkStatus(); } -absl::Status OpenAIChatCompletionsHandler::parseResponsesInputDirectly(std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains) { +absl::Status OpenAIChatCompletionsHandler::parseResponsesInput(std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains) { auto inputIt = doc.FindMember("input"); if (inputIt == doc.MemberEnd()) { return absl::InvalidArgumentError("input missing in request"); @@ -1099,22 +1099,6 @@ const bool OpenAIChatCompletionsHandler::areToolsAvailable() const { return !request.toolNameSchemaMap.empty(); } -const rapidjson::Value* OpenAIChatCompletionsHandler::getRawTools() const { - auto it = doc.FindMember("tools"); - if (it == doc.MemberEnd() || it->value.IsNull()) { - return nullptr; - } - return &it->value; -} - -const rapidjson::Value* OpenAIChatCompletionsHandler::getRawChatTemplateKwargs() const { - auto it = doc.FindMember("chat_template_kwargs"); - if (it == doc.MemberEnd() || it->value.IsNull()) { - return nullptr; - } - return &it->value; -} - const OpenAIChatCompletionsRequest& OpenAIChatCompletionsHandler::getRequest() const { return request; } @@ -1237,11 +1221,56 @@ absl::Status OpenAIChatCompletionsHandler::parseResponsesPart(std::optionalvalue.IsNull()) { + Value toolsCopy(toolsIt->value, alloc); + processedDoc.AddMember("tools", toolsCopy, alloc); + } + + // Copy chat_template_kwargs from original doc if present + auto kwargsIt = doc.FindMember("chat_template_kwargs"); + if (kwargsIt != doc.MemberEnd() && !kwargsIt->value.IsNull()) { + Value kwargsCopy(kwargsIt->value, alloc); + processedDoc.AddMember("chat_template_kwargs", kwargsCopy, alloc); + } + + StringBuffer buffer; + Writer writer(buffer); + processedDoc.Accept(writer); + request.processedJson = buffer.GetString(); + } +#endif // logprobs: bool; optional - defaults to false it = doc.FindMember("logprobs"); if (it != doc.MemberEnd() && !it->value.IsNull()) { diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 96ac3c50a9..5016590f0b 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -81,7 +81,6 @@ class OpenAIChatCompletionsHandler { absl::Status parseCompletionsPart(); absl::Status parseChatCompletionsPart(std::optional maxTokensLimit, std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains); absl::Status parseResponsesPart(std::optional maxTokensLimit, std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains); - absl::Status parseResponsesInputDirectly(std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains); absl::Status parseCommonPart(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength); ParsedOutput parseOutputIfNeeded(const std::vector& generatedIds); @@ -141,8 +140,6 @@ class OpenAIChatCompletionsHandler { absl::StatusOr> parseToolsToJsonContainer(); absl::StatusOr> parseChatTemplateKwargsToJsonContainer(); const bool areToolsAvailable() const; - const rapidjson::Value* getRawTools() const; - const rapidjson::Value* getRawChatTemplateKwargs() const; std::string serializeUnaryResponse(const std::vector& 
generationOutputs); std::string serializeUnaryResponse(ov::genai::EncodedResults& results); diff --git a/src/llm/py_jinja_template_processor.cpp b/src/llm/py_jinja_template_processor.cpp index 45d317e290..432aa8e722 100644 --- a/src/llm/py_jinja_template_processor.cpp +++ b/src/llm/py_jinja_template_processor.cpp @@ -18,8 +18,6 @@ #include #include -#include "src/port/rapidjson_document.hpp" - #pragma warning(push) #pragma warning(disable : 4005 4309 6001 6385 6386 6326 6011 6246 4456) #pragma GCC diagnostic push @@ -91,123 +89,4 @@ bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templ return false; } -static py::object rapidJsonValueToPyObject(const rapidjson::Value& value) { - if (value.IsNull()) - return py::none(); - if (value.IsBool()) - return py::bool_(value.GetBool()); - if (value.IsInt()) - return py::int_(value.GetInt()); - if (value.IsUint()) - return py::int_(value.GetUint()); - if (value.IsInt64()) - return py::int_(value.GetInt64()); - if (value.IsUint64()) - return py::int_(value.GetUint64()); - if (value.IsDouble()) - return py::float_(value.GetDouble()); - if (value.IsString()) - return py::str(value.GetString()); - if (value.IsArray()) { - py::list list; - for (const auto& item : value.GetArray()) { - list.append(rapidJsonValueToPyObject(item)); - } - return list; - } - if (value.IsObject()) { - py::dict dict; - for (auto it = value.MemberBegin(); it != value.MemberEnd(); ++it) { - dict[py::str(it->name.GetString())] = rapidJsonValueToPyObject(it->value); - } - return dict; - } - return py::none(); -} - -bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, - ov::genai::ChatHistory& messages, - const rapidjson::Value* tools, - const rapidjson::Value* chatTemplateKwargs, - std::string& output) { - if (templateProcessor.chatTemplate == nullptr) { - output = "Error: Chat template not loaded correctly, so it cannot be applied"; - return false; - } - - py::gil_scoped_acquire acquire; - try { - // Convert ChatHistory to Python list[dict] by extracting known fields - py::list pyMessages; - for (size_t i = 0; i < messages.size(); ++i) { - py::dict pyMsg; - auto role = messages[i]["role"].as_string(); - if (role.has_value()) { - pyMsg[py::str("role")] = py::str(role.value()); - } - auto content = messages[i]["content"].as_string(); - if (content.has_value()) { - pyMsg[py::str("content")] = py::str(content.value()); - } - pyMessages.append(pyMsg); - } - - py::object pyTools = py::none(); - if (tools != nullptr && !tools->IsNull()) { - pyTools = rapidJsonValueToPyObject(*tools); - } - - py::dict pyKwargs; - if (chatTemplateKwargs != nullptr && chatTemplateKwargs->IsObject()) { - for (auto it = chatTemplateKwargs->MemberBegin(); it != chatTemplateKwargs->MemberEnd(); ++it) { - pyKwargs[py::str(it->name.GetString())] = rapidJsonValueToPyObject(it->value); - } - } - - auto locals = py::dict( - "messages"_a = pyMessages, - "chat_template"_a = templateProcessor.chatTemplate->getObject(), - "tool_chat_template"_a = templateProcessor.toolTemplate->getObject(), - "bos_token"_a = templateProcessor.bosToken, - "eos_token"_a = templateProcessor.eosToken, - "tools"_a = pyTools, - "chat_template_kwargs"_a = pyKwargs); - py::exec(R"( - output = "" - error = "" - try: - if chat_template_kwargs is None: - chat_template_kwargs = {} - elif not isinstance(chat_template_kwargs, dict): - raise Exception("chat_template_kwargs must be an object") - - if tools is None: - output = chat_template.render(messages=messages, bos_token=bos_token, 
eos_token=eos_token, add_generation_prompt=True, **chat_template_kwargs) - else: - output = tool_chat_template.render(messages=messages, tools=tools, bos_token=bos_token, eos_token=eos_token, add_generation_prompt=True, **chat_template_kwargs) - except Exception as e: - error = str(e) - )", - py::globals(), locals); - - std::string result = locals["output"].cast(); - std::string error = locals["error"].cast(); - - if (!error.empty()) { - output = std::move(error); - return false; - } - - output = std::move(result); - return true; - } catch (const pybind11::error_already_set& e) { - LOG(INFO) << "Error occurred when applying chat template: " << e.what(); - output = "Unexpected error occurred when applying chat template"; - } catch (...) { - LOG(INFO) << "Unexpected error occurred when applying chat template"; - output = "Unexpected error occurred when applying chat template"; - } - return false; -} - } // namespace ovms diff --git a/src/llm/py_jinja_template_processor.hpp b/src/llm/py_jinja_template_processor.hpp index 944cf72f5b..95b9e8598b 100644 --- a/src/llm/py_jinja_template_processor.hpp +++ b/src/llm/py_jinja_template_processor.hpp @@ -18,8 +18,6 @@ #include #include -#include -#include #pragma warning(push) #pragma warning(disable : 6326 28182 6011 28020) // Python execution for template processing @@ -29,17 +27,6 @@ #include "src/python/utils.hpp" -namespace rapidjson { -class CrtAllocator; -template -class MemoryPoolAllocator; -template -class GenericValue; -template -struct UTF8; -using Value = GenericValue, MemoryPoolAllocator>; -} // namespace rapidjson - namespace ovms { class PyJinjaTemplateProcessor { @@ -50,10 +37,5 @@ class PyJinjaTemplateProcessor { std::unique_ptr> toolTemplate = nullptr; static bool applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, std::string modelsPath, const std::string& requestBody, std::string& output); - static bool applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, - ov::genai::ChatHistory& messages, - const rapidjson::Value* tools, - const rapidjson::Value* chatTemplateKwargs, - std::string& output); }; } // namespace ovms diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 5dfb2ffbf3..725967ceac 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -209,12 +209,7 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrapiHandler->getChatHistory().size() > 0) { #if (PYTHON_DISABLE == 0) - bool success = PyJinjaTemplateProcessor::applyChatTemplate( - getProperties()->templateProcessor, - executionContext->apiHandler->getChatHistory(), - executionContext->apiHandler->getRawTools(), - executionContext->apiHandler->getRawChatTemplateKwargs(), - inputText); + bool success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); if (!success) { return absl::Status(absl::StatusCode::kInvalidArgument, inputText); } From ae633753e48a806e6f79bf792fa7bd39b31aa595 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 20 Mar 2026 15:34:36 +0100 Subject: [PATCH 15/24] style --- src/llm/apis/openai_completions.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index dae26bda66..15cdbd19b3 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -1226,7 +1226,6 @@ absl::Status OpenAIChatCompletionsHandler::parseResponsesPart(std::optional Date: Fri, 20 Mar 2026 16:42:10 +0100 
Subject: [PATCH 16/24] fix --- src/llm/apis/openai_completions.cpp | 1 - src/llm/apis/openai_completions.hpp | 1 + src/test/http_openai_handler_test.cpp | 90 ++++++++++++++++++++++++++- src/test/llm/llmtemplate_test.cpp | 58 ----------------- 4 files changed, 90 insertions(+), 60 deletions(-) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 15cdbd19b3..c8778aed43 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -1873,7 +1873,6 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD usage.completionTokens = results.perf_metrics.get_num_generated_tokens(); if (endpoint == Endpoint::RESPONSES) { std::vector parsedOutputs; - usage.completionTokens = 0; for (const std::string& text : results.texts) { auto result = tokenizer.encode(text); auto& input_ids = result.input_ids; diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 5016590f0b..50d11225d6 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -81,6 +81,7 @@ class OpenAIChatCompletionsHandler { absl::Status parseCompletionsPart(); absl::Status parseChatCompletionsPart(std::optional maxTokensLimit, std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains); absl::Status parseResponsesPart(std::optional maxTokensLimit, std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains); + absl::Status parseResponsesInput(std::optional allowedLocalMediaPath, std::optional> allowedMediaDomains); absl::Status parseCommonPart(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength); ParsedOutput parseOutputIfNeeded(const std::vector& generatedIds); diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 73afd381d0..1b6945281b 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -521,7 +521,95 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTextInputCreatesUser ASSERT_TRUE(chatHistory[0].contains("content")); EXPECT_EQ(chatHistory[0]["role"], "user"); EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); - EXPECT_TRUE(apiHandler->getProcessedJson().empty()); + if (endpoint() == ovms::Endpoint::CHAT_COMPLETIONS) { + // Chat completions with simple text does not mutate the JSON, so processedJson is empty + EXPECT_TRUE(apiHandler->getProcessedJson().empty()); + } +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquivalentMessages) { + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + + // For Responses, processedJson is always built from chatHistory. + // For chat/completions with simple text, processedJson is empty (original body is used instead). + // In both cases, the chatHistory should be equivalent. 
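+    // For reference, when the Responses path builds processedJson from this
+    // request, the expected rough shape is (a sketch only; key order and
+    // whitespace depend on the serializer and are not asserted byte-for-byte):
+    //   {"messages":[{"role":"user","content":"What is OpenVINO?"}]}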
+ auto& chatHistory = apiHandler->getChatHistory(); + ASSERT_EQ(chatHistory.size(), 1); + EXPECT_EQ(chatHistory[0]["role"], "user"); + EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); + + if (endpoint() == ovms::Endpoint::RESPONSES) { + // Responses path builds processedJson with messages array + const std::string& processedJson = apiHandler->getProcessedJson(); + ASSERT_FALSE(processedJson.empty()) << "Responses should build processedJson"; + // Verify it contains a messages array with the correct content + rapidjson::Document processedDoc; + processedDoc.Parse(processedJson.c_str()); + ASSERT_FALSE(processedDoc.HasParseError()); + ASSERT_TRUE(processedDoc.HasMember("messages")); + ASSERT_TRUE(processedDoc["messages"].IsArray()); + ASSERT_EQ(processedDoc["messages"].Size(), 1u); + EXPECT_STREQ(processedDoc["messages"][0]["role"].GetString(), "user"); + EXPECT_STREQ(processedDoc["messages"][0]["content"].GetString(), "What is OpenVINO?"); + } +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonEquivalentMultiMessage) { + // Test with array input containing multiple messages + std::string json; + if (endpoint() == ovms::Endpoint::RESPONSES) { + json = R"({"model":"llama","input":[ + {"role":"system","content":"You are helpful."}, + {"role":"user","content":"Hello"} + ]})"; + } else { + json = R"({"model":"llama","messages":[ + {"role":"system","content":"You are helpful."}, + {"role":"user","content":"Hello"} + ]})"; + } + auto apiHandler = parseCurrentRequest(json); + + auto& chatHistory = apiHandler->getChatHistory(); + ASSERT_EQ(chatHistory.size(), 2); + EXPECT_EQ(chatHistory[0]["role"], "system"); + EXPECT_EQ(chatHistory[0]["content"], "You are helpful."); + EXPECT_EQ(chatHistory[1]["role"], "user"); + EXPECT_EQ(chatHistory[1]["content"], "Hello"); + + if (endpoint() == ovms::Endpoint::RESPONSES) { + const std::string& processedJson = apiHandler->getProcessedJson(); + ASSERT_FALSE(processedJson.empty()); + rapidjson::Document processedDoc; + processedDoc.Parse(processedJson.c_str()); + ASSERT_FALSE(processedDoc.HasParseError()); + ASSERT_TRUE(processedDoc.HasMember("messages")); + ASSERT_EQ(processedDoc["messages"].Size(), 2u); + EXPECT_STREQ(processedDoc["messages"][0]["role"].GetString(), "system"); + EXPECT_STREQ(processedDoc["messages"][0]["content"].GetString(), "You are helpful."); + EXPECT_STREQ(processedDoc["messages"][1]["role"].GetString(), "user"); + EXPECT_STREQ(processedDoc["messages"][1]["content"].GetString(), "Hello"); + } +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonIncludesToolsWhenPresent) { + std::string json = createToolRequest("\"auto\""); + auto apiHandler = parseCurrentRequest(json); + + EXPECT_TRUE(apiHandler->areToolsAvailable()); + + if (endpoint() == ovms::Endpoint::RESPONSES) { + const std::string& processedJson = apiHandler->getProcessedJson(); + ASSERT_FALSE(processedJson.empty()); + rapidjson::Document processedDoc; + processedDoc.Parse(processedJson.c_str()); + ASSERT_FALSE(processedDoc.HasParseError()); + ASSERT_TRUE(processedDoc.HasMember("messages")); + ASSERT_TRUE(processedDoc.HasMember("tools")); + ASSERT_TRUE(processedDoc["tools"].IsArray()); + ASSERT_GT(processedDoc["tools"].Size(), 0u); + } } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTokenLimitSetsMaxTokens) { diff --git a/src/test/llm/llmtemplate_test.cpp b/src/test/llm/llmtemplate_test.cpp index 10475e204f..8cdf565cba 100644 --- a/src/test/llm/llmtemplate_test.cpp +++ b/src/test/llm/llmtemplate_test.cpp @@ -232,64 +232,6 
@@ TEST_F(LLMChatTemplateTest, ChatTemplateJinjaException) { ASSERT_EQ(finalPrompt, errorOutput); } -TEST_F(LLMChatTemplateTest, ChatTemplateFromChatHistorySingleMessage) { - CopyDefaultChatTemplate(); - LoadTemplateProcessor(); - std::string finalPrompt = ""; - ov::genai::ChatHistory chatHistory; - chatHistory.push_back({{"role", "user"}, {"content", "How can I help you?"}}); - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate( - servable->getProperties()->templateProcessor, - chatHistory, nullptr, nullptr, finalPrompt), - true); - std::string expectedOutput = "User: How can I help you?"; - ASSERT_EQ(finalPrompt, expectedOutput); -} - -TEST_F(LLMChatTemplateTest, ChatTemplateFromChatHistoryMultiMessage) { - CopyDefaultChatTemplate(); - LoadTemplateProcessor(); - std::string finalPrompt = ""; - ov::genai::ChatHistory chatHistory; - chatHistory.push_back({{"role", "user"}, {"content", "How can I help you?"}}); - chatHistory.push_back({{"role", "user"}, {"content", "2How can I help you?"}}); - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate( - servable->getProperties()->templateProcessor, - chatHistory, nullptr, nullptr, finalPrompt), - true); - std::string expectedOutput = "User: How can I help you?User: 2How can I help you?"; - ASSERT_EQ(finalPrompt, expectedOutput); -} - -TEST_F(LLMChatTemplateTest, ChatTemplateFromChatHistoryMatchesJsonOverload) { - CopyDefaultChatTemplate(); - LoadTemplateProcessor(); - - // Apply template via JSON overload - std::string jsonPrompt = ""; - std::string payloadBody = R"( - { - "messages": [{ "role": "user", "content": "hello" }] - } - )"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate( - servable->getProperties()->templateProcessor, - servable->getProperties()->modelsPath, payloadBody, jsonPrompt), - true); - - // Apply template via ChatHistory overload - std::string chatHistoryPrompt = ""; - ov::genai::ChatHistory chatHistory; - chatHistory.push_back({{"role", "user"}, {"content", "hello"}}); - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate( - servable->getProperties()->templateProcessor, - chatHistory, nullptr, nullptr, chatHistoryPrompt), - true); - - // Both overloads must produce the same result - ASSERT_EQ(jsonPrompt, chatHistoryPrompt); -} - TEST_F(LLMChatTemplateTest, ChatTemplateComparePythonAndGenAiProcessors) { GTEST_SKIP() << "Skipping test due to GenAI template processor not being able to compare values of different types (no implicit conversion). 
Enable when resolved."; // Using modified Llama2 template to work with limited tokenizer object (with no models loaded) From 2ab03f3218f31ed838de325212e20311fefee6f8 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Sun, 22 Mar 2026 11:13:38 +0100 Subject: [PATCH 17/24] style --- src/drogon_http_server.cpp | 3 +- src/llm/apis/openai_completions.cpp | 607 +++++++++++++++++--------- src/llm/apis/openai_completions.hpp | 29 ++ src/test/http_openai_handler_test.cpp | 177 +++++++- 4 files changed, 605 insertions(+), 211 deletions(-) diff --git a/src/drogon_http_server.cpp b/src/drogon_http_server.cpp index 1c14d5d57b..210776ac8b 100644 --- a/src/drogon_http_server.cpp +++ b/src/drogon_http_server.cpp @@ -88,7 +88,8 @@ Status DrogonHttpServer::startAcceptingRequests() { drogon::app().disableSigtermHandling(); drogon::app().setDefaultHandler([this](const drogon::HttpRequestPtr& req, std::function&& drogonResponseInitializeCallback) { - bool isTextGeneration = req->path().find("/completions") != std::string::npos; + bool isTextGeneration = req->path().find("/completions") != std::string::npos || + req->path().find("/responses") != std::string::npos; // Here we need to schedule the request to the separate thread pool // in order to use disconnection callback of drogon. diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index c8778aed43..0d6b879359 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -17,7 +17,6 @@ #include "openai_completions.hpp" #include -#include #include #include #include @@ -96,29 +95,6 @@ ov::genai::JsonContainer rapidJsonValueToJsonContainer(const rapidjson::Value& v throw std::invalid_argument("Unsupported JSON value type"); } -std::string serializeResponsesEvent(const std::function&)>& eventSerializer) { - StringBuffer eventBuffer; - Writer eventWriter(eventBuffer); - eventSerializer(eventWriter); - return std::string(eventBuffer.GetString()); -} - -void serializeNotSupportedNullField(Writer& writer, const char* fieldName) { - writer.String(fieldName); - writer.Null(); -} - -void serializeNotSupportedZeroField(Writer& writer, const char* fieldName) { - writer.String(fieldName); - writer.Uint64(0); -} - -void serializeNotSupportedEmptyArrayField(Writer& writer, const char* fieldName) { - writer.String(fieldName); - writer.StartArray(); - writer.EndArray(); -} - } // namespace void OpenAIChatCompletionsHandler::serializeResponsesToolChoice(Writer& writer) const { @@ -193,8 +169,7 @@ void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer(usage.promptTokens)); - writer.String("input_tokens_details"); - writer.StartObject(); - serializeNotSupportedZeroField(writer, "cached_tokens"); - writer.EndObject(); + // TODO: input_tokens_details.cached_tokens not supported writer.String("output_tokens"); writer.Uint64(static_cast(usage.completionTokens)); - writer.String("output_tokens_details"); - writer.StartObject(); - serializeNotSupportedZeroField(writer, "reasoning_tokens"); - writer.EndObject(); + // TODO: output_tokens_details.reasoning_tokens not supported writer.String("total_tokens"); writer.Uint64(static_cast(usage.calculateTotalTokens())); writer.EndObject(); @@ -341,7 +328,7 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const writer.String("max_tokens"); writer.EndObject(); } - serializeNotSupportedNullField(writer, "error"); + // TODO: error not supported in unary response writer.String("model"); writer.String(request.model.c_str()); 
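    // Illustrative sketch of the unary Responses payload being assembled at this
    // point (subset of fields only, in the order of the surrounding writer calls;
    // not an exhaustive or normative listing):
    //   { ..., "model": "<requested model>", "status": "completed" | "incomplete", ... }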
writer.String("status"); @@ -349,8 +336,7 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const writer.String("parallel_tool_calls"); writer.Bool(false); - serializeNotSupportedNullField(writer, "previous_response_id"); - serializeNotSupportedNullField(writer, "reasoning"); + // TODO: previous_response_id not supported writer.String("store"); writer.Bool(true); writer.String("temperature"); @@ -377,7 +363,7 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const } writer.String("truncation"); writer.String("disabled"); - serializeNotSupportedNullField(writer, "user"); + // TODO: user not supported writer.String("metadata"); writer.StartObject(); writer.EndObject(); @@ -391,22 +377,67 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const writer.StartArray(); int outputIndex = 0; for (const auto& parsedOutput : parsedOutputs) { - const std::string outputId = "msg-" + std::to_string(outputIndex++); + // Emit reasoning output item if reasoning is available + if (!parsedOutput.reasoning.empty()) { + const std::string reasoningId = "rs-" + std::to_string(outputIndex); + writer.StartObject(); + writer.String("id"); + writer.String(reasoningId.c_str()); + writer.String("type"); + writer.String("reasoning"); + writer.String("summary"); + writer.StartArray(); + writer.StartObject(); + writer.String("type"); + writer.String("summary_text"); + writer.String("text"); + writer.String(parsedOutput.reasoning.c_str()); + writer.EndObject(); + writer.EndArray(); + writer.EndObject(); + } - writer.StartObject(); - writer.String("id"); - writer.String(outputId.c_str()); - writer.String("type"); - writer.String("message"); - writer.String("role"); - writer.String("assistant"); - writer.String("status"); - writer.String(responseStatus); - writer.String("content"); - writer.StartArray(); - serializeResponsesPart(writer, parsedOutput.content); - writer.EndArray(); - writer.EndObject(); + if (!parsedOutput.toolCalls.empty()) { + // Emit function_call output items for each tool call + for (const auto& toolCall : parsedOutput.toolCalls) { + writer.StartObject(); + writer.String("id"); + writer.String(toolCall.id.c_str()); + writer.String("type"); + writer.String("function_call"); + writer.String("status"); + writer.String(responseStatus); + writer.String("call_id"); + writer.String(toolCall.id.c_str()); + writer.String("name"); + writer.String(toolCall.name.c_str()); + writer.String("arguments"); + writer.String(toolCall.arguments.c_str()); + writer.EndObject(); + } + } + + // Emit message output item if there is non-empty content or no tool calls + if (!parsedOutput.content.empty() || parsedOutput.toolCalls.empty()) { + const std::string outputId = "msg-" + std::to_string(outputIndex); + + writer.StartObject(); + writer.String("id"); + writer.String(outputId.c_str()); + writer.String("type"); + writer.String("message"); + writer.String("role"); + writer.String("assistant"); + writer.String("status"); + writer.String(responseStatus); + writer.String("content"); + writer.StartArray(); + serializeResponsesPart(writer, parsedOutput.content); + writer.EndArray(); + writer.EndObject(); + } + + outputIndex++; } writer.EndArray(); @@ -414,16 +445,10 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const writer.StartObject(); writer.String("input_tokens"); writer.Uint64(static_cast(usage.promptTokens)); - writer.String("input_tokens_details"); - writer.StartObject(); - serializeNotSupportedZeroField(writer, 
"cached_tokens"); - writer.EndObject(); + // TODO: input_tokens_details.cached_tokens not supported writer.String("output_tokens"); writer.Uint64(static_cast(usage.completionTokens)); - writer.String("output_tokens_details"); - writer.StartObject(); - serializeNotSupportedZeroField(writer, "reasoning_tokens"); - writer.EndObject(); + // TODO: output_tokens_details.reasoning_tokens not supported writer.String("total_tokens"); writer.Uint64(static_cast(usage.calculateTotalTokens())); writer.EndObject(); @@ -1654,7 +1679,7 @@ static bool hasToolCallsInStreamingDelta(const rapidjson::Document& delta) { ParsedOutput OpenAIChatCompletionsHandler::parseOutputIfNeeded(const std::vector& generatedIds) { OVMS_PROFILE_FUNCTION(); ParsedOutput parsedOutput; - if (endpoint != Endpoint::CHAT_COMPLETIONS || outputParser == nullptr) { + if ((endpoint != Endpoint::CHAT_COMPLETIONS && endpoint != Endpoint::RESPONSES) || outputParser == nullptr) { parsedOutput.content = this->tokenizer.decode(generatedIds); } else { parsedOutput = outputParser->parse(generatedIds, this->areToolsAvailable()); @@ -1965,6 +1990,248 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD return jsonResponse.ToString(); } +void OpenAIChatCompletionsHandler::writeEventHeader(Writer& writer, const char* eventType) { + writer.StartObject(); + writer.String("type"); + writer.String(eventType); + writer.String("sequence_number"); + writer.Uint64(responsesStreamingSequenceNumber++); +} + +void OpenAIChatCompletionsHandler::writeContentLocation(Writer& writer, const std::string& itemId, uint64_t outputIndex) { + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("content_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(itemId.c_str()); +} + +void OpenAIChatCompletionsHandler::writeReasoningLocation(Writer& writer, const std::string& itemId) { + writer.String("output_index"); + writer.Uint64(0); + writer.String("summary_index"); + writer.Uint64(0); + writer.String("item_id"); + writer.String(itemId.c_str()); +} + +std::string OpenAIChatCompletionsHandler::serializeResponseCreatedEvent(const std::string& responseId, int64_t createdAt) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.created"); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeResponseInProgressEvent(const std::string& responseId, int64_t createdAt) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.in_progress"); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeOutputItemAddedEvent(const std::string& outputItemId, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_item.added"); + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("item"); + serializeResponsesOutputItem(writer, outputItemId, "", "in_progress", false); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeContentPartAddedEvent(const std::string& outputItemId, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, 
"response.content_part.added"); + writeContentLocation(writer, outputItemId, outputIndex); + writer.String("part"); + serializeResponsesPart(writer, ""); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeOutputTextDeltaEvent(const std::string& outputItemId, const std::string& delta, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_text.delta"); + writeContentLocation(writer, outputItemId, outputIndex); + writer.String("delta"); + writer.String(delta.c_str()); + // TODO: logprobs not supported + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeOutputTextDoneEvent(const std::string& outputItemId, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_text.done"); + writeContentLocation(writer, outputItemId, outputIndex); + writer.String("text"); + writer.String(responsesStreamingOutputText.c_str()); + // TODO: logprobs not supported + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeContentPartDoneEvent(const std::string& outputItemId, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.content_part.done"); + writeContentLocation(writer, outputItemId, outputIndex); + writer.String("part"); + serializeResponsesPart(writer, responsesStreamingOutputText); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeOutputItemDoneEvent(const std::string& outputItemId, ov::genai::GenerationFinishReason finishReason, uint64_t outputIndex) { + const char* itemStatus = (finishReason == ov::genai::GenerationFinishReason::LENGTH) ? "incomplete" : "completed"; + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_item.done"); + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("item"); + serializeResponsesOutputItem(writer, outputItemId, responsesStreamingOutputText, itemStatus, true); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeResponseCompletedEvent(const std::string& responseId, int64_t createdAt, ov::genai::GenerationFinishReason finishReason) { + const bool isIncomplete = (finishReason == ov::genai::GenerationFinishReason::LENGTH); + const char* responseStatus = isIncomplete ? "incomplete" : "completed"; + const char* eventType = isIncomplete ? "response.incomplete" : "response.completed"; + const char* incompleteReason = isIncomplete ? 
"max_tokens" : nullptr; + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, eventType); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, responseStatus, responsesStreamingOutputText, true, incompleteReason); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeResponseFailedEventBody(const std::string& responseId, int64_t createdAt, const std::string& errorMessage, const char* errorCode) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.failed"); + writer.String("response"); + serializeResponsesResponseObject(writer, responseId, createdAt, "failed", responsesStreamingOutputText, false, + nullptr, errorMessage.c_str(), errorCode); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeReasoningOutputItemAddedEvent(const std::string& reasoningItemId) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_item.added"); + writer.String("output_index"); + writer.Uint64(0); + writer.String("item"); + writer.StartObject(); + writer.String("id"); + writer.String(reasoningItemId.c_str()); + writer.String("type"); + writer.String("reasoning"); + writer.String("summary"); + writer.StartArray(); + writer.EndArray(); + writer.EndObject(); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeReasoningSummaryPartAddedEvent(const std::string& reasoningItemId) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.reasoning_summary_part.added"); + writeReasoningLocation(writer, reasoningItemId); + writer.String("part"); + writer.StartObject(); + writer.String("type"); + writer.String("summary_text"); + writer.String("text"); + writer.String(""); + writer.EndObject(); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeReasoningSummaryTextDeltaEvent(const std::string& reasoningItemId, const std::string& delta) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.reasoning_summary_text.delta"); + writeReasoningLocation(writer, reasoningItemId); + writer.String("delta"); + writer.String(delta.c_str()); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeReasoningSummaryTextDoneEvent(const std::string& reasoningItemId) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.reasoning_summary_text.done"); + writeReasoningLocation(writer, reasoningItemId); + writer.String("text"); + writer.String(responsesStreamingReasoningText.c_str()); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeReasoningSummaryPartDoneEvent(const std::string& reasoningItemId) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.reasoning_summary_part.done"); + writeReasoningLocation(writer, reasoningItemId); + writer.String("part"); + writer.StartObject(); + writer.String("type"); + writer.String("summary_text"); + writer.String("text"); + writer.String(responsesStreamingReasoningText.c_str()); + writer.EndObject(); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeReasoningOutputItemDoneEvent(const std::string& reasoningItemId) { + StringBuffer buffer; + Writer writer(buffer); + 
writeEventHeader(writer, "response.output_item.done"); + writer.String("output_index"); + writer.Uint64(0); + writer.String("item"); + writer.StartObject(); + writer.String("id"); + writer.String(reasoningItemId.c_str()); + writer.String("type"); + writer.String("reasoning"); + writer.String("summary"); + writer.StartArray(); + writer.StartObject(); + writer.String("type"); + writer.String("summary_text"); + writer.String("text"); + writer.String(responsesStreamingReasoningText.c_str()); + writer.EndObject(); + writer.EndArray(); + writer.EndObject(); + writer.EndObject(); + return buffer.GetString(); +} + std::string OpenAIChatCompletionsHandler::serializeResponsesStreamingInitEvents() { const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); const std::string responseId = "resp-" + std::to_string(createdAt); @@ -1972,57 +2239,16 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesStreamingInitEvents( std::vector events; - events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { - writer.StartObject(); - writer.String("type"); - writer.String("response.created"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("response"); - serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); - writer.EndObject(); - })); - - events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt](Writer& writer) { - writer.StartObject(); - writer.String("type"); - writer.String("response.in_progress"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("response"); - serializeResponsesResponseObject(writer, responseId, createdAt, "in_progress", "", false); - writer.EndObject(); - })); - - events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { - writer.StartObject(); - writer.String("type"); - writer.String("response.output_item.added"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("output_index"); - writer.Uint64(0); - writer.String("item"); - serializeResponsesOutputItem(writer, outputItemId, "", "in_progress", false); - writer.EndObject(); - })); + events.emplace_back(serializeResponseCreatedEvent(responseId, createdAt)); + events.emplace_back(serializeResponseInProgressEvent(responseId, createdAt)); - events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { - writer.StartObject(); - writer.String("type"); - writer.String("response.content_part.added"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("output_index"); - writer.Uint64(0); - writer.String("content_index"); - writer.Uint64(0); - writer.String("item_id"); - writer.String(outputItemId.c_str()); - writer.String("part"); - serializeResponsesPart(writer, ""); - writer.EndObject(); - })); + // When outputParser is present, defer output item events until first chunk + // because reasoning items need to come before message items + if (outputParser == nullptr) { + events.emplace_back(serializeOutputItemAddedEvent(outputItemId)); + events.emplace_back(serializeContentPartAddedEvent(outputItemId)); + responsesMessageInitialized = true; + } responsesStreamingInitialized = true; @@ -2040,6 +2266,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str const auto createdAt = 
std::chrono::duration_cast(created.time_since_epoch()).count(); const std::string responseId = "resp-" + std::to_string(createdAt); const std::string outputItemId = "msg-0"; + const std::string reasoningItemId = "rs-0"; std::vector events; if (!responsesStreamingInitialized) { @@ -2050,91 +2277,69 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str } } - if (!chunkResponse.empty()) { - responsesStreamingOutputText += chunkResponse; - events.emplace_back(serializeResponsesEvent([this, &chunkResponse, &outputItemId](Writer& writer) { - writer.StartObject(); - writer.String("type"); - writer.String("response.output_text.delta"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("output_index"); - writer.Uint64(0); - writer.String("content_index"); - writer.Uint64(0); - writer.String("item_id"); - writer.String(outputItemId.c_str()); - writer.String("delta"); - writer.String(chunkResponse.c_str()); - serializeNotSupportedEmptyArrayField(writer, "logprobs"); - writer.EndObject(); - })); + if (outputParser != nullptr) { + // Use output parser to separate reasoning from content + std::optional delta = outputParser->parseChunk(chunkResponse, areToolsAvailable(), finishReason); + + if (delta.has_value() && delta->HasMember("delta") && (*delta)["delta"].IsObject()) { + const auto& deltaObj = (*delta)["delta"]; + if (deltaObj.HasMember("reasoning_content") && deltaObj["reasoning_content"].IsString()) { + // Reasoning chunk + if (!responsesReasoningInitialized) { + events.emplace_back(serializeReasoningOutputItemAddedEvent(reasoningItemId)); + events.emplace_back(serializeReasoningSummaryPartAddedEvent(reasoningItemId)); + responsesReasoningInitialized = true; + } + const std::string reasoningText = deltaObj["reasoning_content"].GetString(); + responsesStreamingReasoningText += reasoningText; + events.emplace_back(serializeReasoningSummaryTextDeltaEvent(reasoningItemId, reasoningText)); + } else if (deltaObj.HasMember("content") && deltaObj["content"].IsString()) { + // Content chunk - close reasoning if it was active, init message if needed + if (responsesReasoningInitialized && !responsesReasoningCompleted) { + events.emplace_back(serializeReasoningSummaryTextDoneEvent(reasoningItemId)); + events.emplace_back(serializeReasoningSummaryPartDoneEvent(reasoningItemId)); + events.emplace_back(serializeReasoningOutputItemDoneEvent(reasoningItemId)); + responsesReasoningCompleted = true; + } + const uint64_t msgIdx = responsesReasoningInitialized ? 
1 : 0; + if (!responsesMessageInitialized) { + events.emplace_back(serializeOutputItemAddedEvent(outputItemId, msgIdx)); + events.emplace_back(serializeContentPartAddedEvent(outputItemId, msgIdx)); + responsesMessageInitialized = true; + } + const std::string contentText = deltaObj["content"].GetString(); + responsesStreamingOutputText += contentText; + events.emplace_back(serializeOutputTextDeltaEvent(outputItemId, contentText, msgIdx)); + } + } + // If delta is nullopt, the parser is accumulating tag tokens - skip + } else { + // No parser - pass through raw text + if (!chunkResponse.empty()) { + responsesStreamingOutputText += chunkResponse; + events.emplace_back(serializeOutputTextDeltaEvent(outputItemId, chunkResponse)); + } } if (finishReason != ov::genai::GenerationFinishReason::NONE) { - events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { - writer.StartObject(); - writer.String("type"); - writer.String("response.output_text.done"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("output_index"); - writer.Uint64(0); - writer.String("content_index"); - writer.Uint64(0); - writer.String("item_id"); - writer.String(outputItemId.c_str()); - writer.String("text"); - writer.String(responsesStreamingOutputText.c_str()); - serializeNotSupportedEmptyArrayField(writer, "logprobs"); - writer.EndObject(); - })); - - events.emplace_back(serializeResponsesEvent([this, &outputItemId](Writer& writer) { - writer.StartObject(); - writer.String("type"); - writer.String("response.content_part.done"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("output_index"); - writer.Uint64(0); - writer.String("content_index"); - writer.Uint64(0); - writer.String("item_id"); - writer.String(outputItemId.c_str()); - writer.String("part"); - serializeResponsesPart(writer, responsesStreamingOutputText); - writer.EndObject(); - })); - - events.emplace_back(serializeResponsesEvent([this, &outputItemId, finishReason](Writer& writer) { - const char* itemStatus = (finishReason == ov::genai::GenerationFinishReason::LENGTH) ? "incomplete" : "completed"; - writer.StartObject(); - writer.String("type"); - writer.String("response.output_item.done"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("output_index"); - writer.Uint64(0); - writer.String("item"); - serializeResponsesOutputItem(writer, outputItemId, responsesStreamingOutputText, itemStatus, true); - writer.EndObject(); - })); - - events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt, finishReason](Writer& writer) { - const bool isIncomplete = (finishReason == ov::genai::GenerationFinishReason::LENGTH); - const char* responseStatus = isIncomplete ? "incomplete" : "completed"; - const char* eventType = isIncomplete ? "response.incomplete" : "response.completed"; - const char* incompleteReason = isIncomplete ? 
"max_tokens" : nullptr; - writer.StartObject(); - writer.String("type"); - writer.String(eventType); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("response"); - serializeResponsesResponseObject(writer, responseId, createdAt, responseStatus, responsesStreamingOutputText, true, incompleteReason); - writer.EndObject(); - })); + // Close any open reasoning that wasn't closed by content transition + if (responsesReasoningInitialized && !responsesReasoningCompleted) { + events.emplace_back(serializeReasoningSummaryTextDoneEvent(reasoningItemId)); + events.emplace_back(serializeReasoningSummaryPartDoneEvent(reasoningItemId)); + events.emplace_back(serializeReasoningOutputItemDoneEvent(reasoningItemId)); + responsesReasoningCompleted = true; + } + const uint64_t msgIdx = responsesReasoningInitialized ? 1 : 0; + // Ensure message item is initialized even if no content was produced + if (!responsesMessageInitialized) { + events.emplace_back(serializeOutputItemAddedEvent(outputItemId, msgIdx)); + events.emplace_back(serializeContentPartAddedEvent(outputItemId, msgIdx)); + responsesMessageInitialized = true; + } + events.emplace_back(serializeOutputTextDoneEvent(outputItemId, msgIdx)); + events.emplace_back(serializeContentPartDoneEvent(outputItemId, msgIdx)); + events.emplace_back(serializeOutputItemDoneEvent(outputItemId, finishReason, msgIdx)); + events.emplace_back(serializeResponseCompletedEvent(responseId, createdAt, finishReason)); } if (events.empty()) { @@ -2253,17 +2458,7 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesFailedEvent(const st } } - events.emplace_back(serializeResponsesEvent([this, &responseId, createdAt, &errorMessage, errorCode](Writer& writer) { - writer.StartObject(); - writer.String("type"); - writer.String("response.failed"); - writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); - writer.String("response"); - serializeResponsesResponseObject(writer, responseId, createdAt, "failed", responsesStreamingOutputText, false, - nullptr, errorMessage.c_str(), errorCode); - writer.EndObject(); - })); + events.emplace_back(serializeResponseFailedEventBody(responseId, createdAt, errorMessage, errorCode)); std::stringstream ss; ss << events.front(); diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 50d11225d6..f5eb54dd2c 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -74,6 +74,10 @@ class OpenAIChatCompletionsHandler { size_t responsesStreamingSequenceNumber = 1; bool responsesStreamingInitialized = false; std::string responsesStreamingOutputText; + bool responsesReasoningInitialized = false; + bool responsesReasoningCompleted = false; + bool responsesMessageInitialized = false; + std::string responsesStreamingReasoningText; // Output parser is used to parse chat completions response to extract specific fields like tool calls and reasoning. 
std::unique_ptr outputParser = nullptr; @@ -99,6 +103,31 @@ class OpenAIChatCompletionsHandler { std::string serializeResponsesUnaryResponse(const std::vector& parsedOutputs, ov::genai::GenerationFinishReason finishReason = ov::genai::GenerationFinishReason::STOP) const; + // Responses API streaming event building blocks + void writeEventHeader(Writer& writer, const char* eventType); + static void writeContentLocation(Writer& writer, const std::string& itemId, uint64_t outputIndex = 0); + static void writeReasoningLocation(Writer& writer, const std::string& itemId); + + // Individual Responses API streaming event serializers + std::string serializeResponseCreatedEvent(const std::string& responseId, int64_t createdAt); + std::string serializeResponseInProgressEvent(const std::string& responseId, int64_t createdAt); + std::string serializeOutputItemAddedEvent(const std::string& outputItemId, uint64_t outputIndex = 0); + std::string serializeContentPartAddedEvent(const std::string& outputItemId, uint64_t outputIndex = 0); + std::string serializeOutputTextDeltaEvent(const std::string& outputItemId, const std::string& delta, uint64_t outputIndex = 0); + std::string serializeOutputTextDoneEvent(const std::string& outputItemId, uint64_t outputIndex = 0); + std::string serializeContentPartDoneEvent(const std::string& outputItemId, uint64_t outputIndex = 0); + std::string serializeOutputItemDoneEvent(const std::string& outputItemId, ov::genai::GenerationFinishReason finishReason, uint64_t outputIndex = 0); + std::string serializeResponseCompletedEvent(const std::string& responseId, int64_t createdAt, ov::genai::GenerationFinishReason finishReason); + + // Reasoning streaming event serializers + std::string serializeReasoningOutputItemAddedEvent(const std::string& reasoningItemId); + std::string serializeReasoningSummaryPartAddedEvent(const std::string& reasoningItemId); + std::string serializeReasoningSummaryTextDeltaEvent(const std::string& reasoningItemId, const std::string& delta); + std::string serializeReasoningSummaryTextDoneEvent(const std::string& reasoningItemId); + std::string serializeReasoningSummaryPartDoneEvent(const std::string& reasoningItemId); + std::string serializeReasoningOutputItemDoneEvent(const std::string& reasoningItemId); + std::string serializeResponseFailedEventBody(const std::string& responseId, int64_t createdAt, const std::string& errorMessage, const char* errorCode); + public: OpenAIChatCompletionsHandler(Document& doc, Endpoint endpoint, std::chrono::time_point creationTime, ov::genai::Tokenizer tokenizer, const std::string& toolParserName = "", const std::string& reasoningParserName = "") : diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 1b6945281b..97c3a1d608 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -905,6 +905,77 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsO ASSERT_NE(serialized.find("\"text\":"), std::string::npos) << serialized; } +TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsReasoningOutputItem) { + std::string json = R"({ + "model": "llama", + "input": "Think about this", + "max_output_tokens": 100 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer, "", "qwen3"); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional 
maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::EncodedResults results;
+    std::string modelOutput = "<think>Let me reason about this</think>The answer is 42";
+    ov::Tensor outputIds = tokenizer->encode(modelOutput, ov::genai::add_special_tokens(false)).input_ids;
+    ASSERT_EQ(outputIds.get_shape().size(), 2);
+    ASSERT_EQ(outputIds.get_shape()[0], 1);
+    ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
+    int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
+    results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+
+    std::string serialized = apiHandler->serializeUnaryResponse(results);
+    ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
+    // Reasoning output item should be present
+    ASSERT_NE(serialized.find("\"type\":\"reasoning\""), std::string::npos) << "Reasoning output item missing: " << serialized;
+    ASSERT_NE(serialized.find("\"type\":\"summary_text\""), std::string::npos) << "Summary text missing: " << serialized;
+    // Reasoning item should NOT have status field (per OpenAI spec)
+    auto reasoningPos = serialized.find("\"type\":\"reasoning\"");
+    auto messagePos = serialized.find("\"type\":\"message\"");
+    ASSERT_LT(reasoningPos, messagePos) << "Reasoning item should come before message item";
+    // Reasoning item ID should start with rs-
+    ASSERT_NE(serialized.find("\"id\":\"rs-"), std::string::npos) << serialized;
+    // Message output item should still be present with content
+    ASSERT_NE(serialized.find("\"type\":\"output_text\""), std::string::npos) << serialized;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesOmitsReasoningWhenAbsent) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "max_output_tokens": 5
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer, "", "qwen3");
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::EncodedResults results;
+    ov::Tensor outputIds = tokenizer->encode("OVMS is great", ov::genai::add_special_tokens(false)).input_ids;
+    ASSERT_EQ(outputIds.get_shape().size(), 2);
+    ASSERT_EQ(outputIds.get_shape()[0], 1);
+    ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
+    int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
+    results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+
+    std::string serialized = apiHandler->serializeUnaryResponse(results);
+    ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
+    // No reasoning output item when model output has no <think> tags
+    ASSERT_EQ(serialized.find("\"type\":\"reasoning\""), std::string::npos) << "Reasoning item should not be present: " << serialized;
+    // Message item should still be present
+    ASSERT_NE(serialized.find("\"type\":\"message\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"type\":\"output_text\""), std::string::npos) << serialized;
+}
+
 TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesContainsRequiredEvents) {
     std::string json = R"({
         "model": "llama",
@@ -969,6 +1040,107 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesContains
     ASSERT_LT(itemDonePos, completedPos) << "output_item.done must come before response.completed";
 }
 
+TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesWithReasoningEmitsReasoningEvents) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "Think about this",
+        "stream": true
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer, "", "qwen3");
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    // Phase 1: Init events - should only have created + in_progress (output items deferred)
+    std::string initChunk = apiHandler->serializeResponsesStreamingInitEvents();
+    ASSERT_NE(initChunk.find("\"type\":\"response.created\""), std::string::npos) << initChunk;
+    ASSERT_NE(initChunk.find("\"type\":\"response.in_progress\""), std::string::npos) << initChunk;
+    // Output item events should be deferred when parser is present
+    ASSERT_EQ(initChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << "output_item.added should be deferred: " << initChunk;
+    ASSERT_EQ(initChunk.find("\"type\":\"response.content_part.added\""), std::string::npos) << "content_part.added should be deferred: " << initChunk;
+
+    // Phase 2: Reasoning chunk with <think> tag - should emit reasoning init + delta
+    std::string reasoningChunk = apiHandler->serializeStreamingChunk("<think>", ov::genai::GenerationFinishReason::NONE);
+    // <think> tag itself should be consumed by parser, no events
+    // (parser returns nullopt for tag tokens)
+
+    // Phase 3: Reasoning content
+    std::string reasoningContent = apiHandler->serializeStreamingChunk("Let me think", ov::genai::GenerationFinishReason::NONE);
+    ASSERT_NE(reasoningContent.find("\"type\":\"response.output_item.added\""), std::string::npos) << "Should have reasoning output_item.added: " << reasoningContent;
+    ASSERT_NE(reasoningContent.find("\"type\":\"reasoning\""), std::string::npos) << "Output item should be reasoning type: " << reasoningContent;
+    ASSERT_NE(reasoningContent.find("\"type\":\"response.reasoning_summary_part.added\""), std::string::npos) << reasoningContent;
+    ASSERT_NE(reasoningContent.find("\"type\":\"response.reasoning_summary_text.delta\""), std::string::npos) << reasoningContent;
+    ASSERT_NE(reasoningContent.find("\"delta\":\"Let me think\""), std::string::npos) << reasoningContent;
+
+    // Phase 4: More reasoning
+    std::string moreReasoning = apiHandler->serializeStreamingChunk(" harder", ov::genai::GenerationFinishReason::NONE);
+    ASSERT_NE(moreReasoning.find("\"type\":\"response.reasoning_summary_text.delta\""), std::string::npos) << moreReasoning;
+    ASSERT_NE(moreReasoning.find("\"delta\":\" harder\""), std::string::npos) << moreReasoning;
+    // Should NOT have another output_item.added
+    ASSERT_EQ(moreReasoning.find("\"type\":\"response.output_item.added\""), std::string::npos) << "No repeated init: " << moreReasoning;
+
+    // Phase 5: End of reasoning with </think>
+    std::string endThink = apiHandler->serializeStreamingChunk("</think>", ov::genai::GenerationFinishReason::NONE);
+    // </think> tag consumed by parser
+
+    // Phase 6: Content chunk - should close reasoning and open message
+    std::string contentChunk = apiHandler->serializeStreamingChunk("The answer", ov::genai::GenerationFinishReason::NONE);
+    ASSERT_NE(contentChunk.find("\"type\":\"response.reasoning_summary_text.done\""), std::string::npos) << "Should close reasoning: " << contentChunk;
+    ASSERT_NE(contentChunk.find("\"type\":\"response.reasoning_summary_part.done\""), std::string::npos) << contentChunk;
+    // Message item should be at output_index 1
+    ASSERT_NE(contentChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << "Should add message item: " << contentChunk;
+    ASSERT_NE(contentChunk.find("\"type\":\"response.content_part.added\""), std::string::npos) << contentChunk;
+    ASSERT_NE(contentChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << contentChunk;
+
+    // Phase 7: Final chunk
+    std::string finalChunk = apiHandler->serializeStreamingChunk(" is 42", ov::genai::GenerationFinishReason::STOP);
+    ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << finalChunk;
+    ASSERT_NE(finalChunk.find("\"type\":\"response.output_text.done\""), std::string::npos) << finalChunk;
+    ASSERT_NE(finalChunk.find("\"type\":\"response.content_part.done\""), std::string::npos) << finalChunk;
+    ASSERT_NE(finalChunk.find("\"type\":\"response.output_item.done\""), std::string::npos) << finalChunk;
+    ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk;
+    // Completed event should contain reasoning in output
+    ASSERT_NE(finalChunk.find("\"type\":\"reasoning\""), std::string::npos) << "Completed response should include reasoning: " << finalChunk;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesWithoutReasoningWorksNormally) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "stream": true
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared<ovms::OpenAIChatCompletionsHandler>(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer, "", "qwen3");
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    // Init events should be deferred (parser present)
+    std::string initChunk = apiHandler->serializeResponsesStreamingInitEvents();
+    ASSERT_NE(initChunk.find("\"type\":\"response.created\""), std::string::npos) << initChunk;
+    ASSERT_EQ(initChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << "Should be deferred: " << initChunk;
+
+    // Content without reasoning - should emit message init events on first content
+    std::string contentChunk = apiHandler->serializeStreamingChunk("Hello", ov::genai::GenerationFinishReason::NONE);
+    ASSERT_NE(contentChunk.find("\"type\":\"response.output_item.added\""), std::string::npos) << "Should init message: " << contentChunk;
+    ASSERT_NE(contentChunk.find("\"type\":\"response.content_part.added\""), std::string::npos) << contentChunk;
+    ASSERT_NE(contentChunk.find("\"type\":\"response.output_text.delta\""), std::string::npos) << contentChunk;
+    // Should NOT have any reasoning events
+    ASSERT_EQ(contentChunk.find("\"type\":\"reasoning\""), std::string::npos) << "No reasoning: " << contentChunk;
+    ASSERT_EQ(contentChunk.find("\"type\":\"response.reasoning_summary"), std::string::npos) << "No reasoning: " << contentChunk;
+
+    // Final chunk
+    std::string finalChunk = apiHandler->serializeStreamingChunk(" world", ov::genai::GenerationFinishReason::STOP);
+    ASSERT_NE(finalChunk.find("\"type\":\"response.completed\""), std::string::npos) << finalChunk;
+}
+
 TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingUsageChunkForResponsesIsEmpty) {
     std::string json = R"({
         "model": "llama",
@@ -1067,13 +1239,10 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesEmitsCom
     // Response status should be "completed"
     ASSERT_NE(finalChunk.find("\"status\":\"completed\""), std::string::npos) << finalChunk;
 
-    // Should contain new spec-aligned fields
+    // Should contain spec-aligned fields
     ASSERT_NE(finalChunk.find("\"error\":null"), std::string::npos) << "Should contain error:null: " << finalChunk;
-    ASSERT_NE(finalChunk.find("\"previous_response_id\":null"), std::string::npos) << finalChunk;
-    ASSERT_NE(finalChunk.find("\"reasoning\":null"), std::string::npos) << finalChunk;
     ASSERT_NE(finalChunk.find("\"store\":true"), std::string::npos) << finalChunk;
     ASSERT_NE(finalChunk.find("\"truncation\":\"disabled\""), std::string::npos) << finalChunk;
-    ASSERT_NE(finalChunk.find("\"user\":null"), std::string::npos) << finalChunk;
     ASSERT_NE(finalChunk.find("\"metadata\":{}"), std::string::npos) << finalChunk;
 }

From 2d1b3502c1a8583ec83551a2a7c4b34db9c7feca Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Sun, 22 Mar 2026 11:41:21 +0100
Subject: [PATCH 18/24] style

---
 src/llm/apis/openai_completions.cpp   | 34 ++++++++++
 src/test/http_openai_handler_test.cpp | 97 +++++++++++++++++++++++++++
 2 files changed, 131 insertions(+)

diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 0d6b879359..58c41fcc6a 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -1251,6 +1251,40 @@ absl::Status OpenAIChatCompletionsHandler::parseResponsesPart(std::optional
+    auto it = doc.FindMember("reasoning");
+    if (it != doc.MemberEnd() && !it->value.IsNull()) {
+        if (!it->value.IsObject()) {
+            return absl::InvalidArgumentError("reasoning is not an object");
+        }
+        const auto& reasoningObj = it->value;
+        auto effortIt = reasoningObj.FindMember("effort");
+        if (effortIt != reasoningObj.MemberEnd() && !effortIt->value.IsNull()) {
+            if (!effortIt->value.IsString()) {
+                return absl::InvalidArgumentError("reasoning.effort is not a string");
+            }
+            const std::string effort = effortIt->value.GetString();
+            if (effort != "low" && effort != "medium" && effort != "high") {
+                return absl::InvalidArgumentError("reasoning.effort must be one of: low, medium, high");
+            }
+            // Inject enable_thinking: true into chat_template_kwargs if not already explicitly set
+            auto kwargsIt = doc.FindMember("chat_template_kwargs");
+            if (kwargsIt == doc.MemberEnd()) {
+                rapidjson::Value kwargs(rapidjson::kObjectType);
+                kwargs.AddMember("enable_thinking", true, doc.GetAllocator());
+                doc.AddMember("chat_template_kwargs", kwargs, doc.GetAllocator());
+            } else if (kwargsIt->value.IsObject()) {
+                auto enableThinkingIt = kwargsIt->value.FindMember("enable_thinking");
+                if (enableThinkingIt == kwargsIt->value.MemberEnd()) {
+                    kwargsIt->value.AddMember("enable_thinking", true, doc.GetAllocator());
+                }
+                // If enable_thinking is already set explicitly, the user's value takes precedence
+            }
+        }
+        // summary field is accepted but ignored
+    }
+
 #if (PYTHON_DISABLE == 0)
     // Build processedJson with "messages" array from chatHistory so that
     // the Python chat template path (which reads request_json["messages"])
diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index 97c3a1d608..9e6d8cff88 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -976,6 +976,103 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesOmitsReas
ASSERT_NE(serialized.find("\"type\":\"output_text\""), std::string::npos) << serialized; } +TEST_F(HttpOpenAIHandlerParsingTest, parseResponsesReasoningParameterInjectsEnableThinking) { + std::string json = R"({ + "model": "llama", + "input": "Think about this", + "reasoning": {"effort": "high", "summary": "auto"}, + "max_output_tokens": 100 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer, "", "qwen3"); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // Verify that chat_template_kwargs was injected with enable_thinking: true + auto chatTemplateKwargsStatus = apiHandler->parseChatTemplateKwargsToJsonContainer(); + ASSERT_TRUE(chatTemplateKwargsStatus.ok()); + ASSERT_TRUE(chatTemplateKwargsStatus.value().has_value()); +} + +TEST_F(HttpOpenAIHandlerParsingTest, parseResponsesReasoningParameterAllEffortValuesWork) { + for (const auto& effort : {"low", "medium", "high"}) { + std::string json = R"({"model": "llama", "input": "test", "reasoning": {"effort": ")" + std::string(effort) + R"("}, "max_output_tokens": 10})"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()) << json; + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()) << "Failed for effort: " << effort; + + auto chatTemplateKwargsStatus = apiHandler->parseChatTemplateKwargsToJsonContainer(); + ASSERT_TRUE(chatTemplateKwargsStatus.ok()); + ASSERT_TRUE(chatTemplateKwargsStatus.value().has_value()) << "enable_thinking not injected for effort: " << effort; + } +} + +TEST_F(HttpOpenAIHandlerParsingTest, parseResponsesReasoningParameterInvalidEffortRejected) { + std::string json = R"({ + "model": "llama", + "input": "test", + "reasoning": {"effort": "invalid"}, + "max_output_tokens": 10 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_NE(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); +} + +TEST_F(HttpOpenAIHandlerParsingTest, parseResponsesReasoningParameterDoesNotOverrideExplicitKwargs) { + std::string json = R"({ + "model": "llama", + "input": "test", + "reasoning": {"effort": "high"}, + "chat_template_kwargs": {"enable_thinking": false}, + "max_output_tokens": 10 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // chat_template_kwargs should exist, but the explicit enable_thinking: false should be preserved + auto chatTemplateKwargsStatus = apiHandler->parseChatTemplateKwargsToJsonContainer(); + ASSERT_TRUE(chatTemplateKwargsStatus.ok()); + 
ASSERT_TRUE(chatTemplateKwargsStatus.value().has_value()); +} + +TEST_F(HttpOpenAIHandlerParsingTest, parseResponsesReasoningParameterNotAnObjectRejected) { + std::string json = R"({ + "model": "llama", + "input": "test", + "reasoning": "high", + "max_output_tokens": 10 + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_NE(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); +} + TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkForResponsesContainsRequiredEvents) { std::string json = R"({ "model": "llama", From eddfbd9ed4f0a09e931d34ed14bc781dfe6dc797 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Mon, 23 Mar 2026 08:36:20 +0100 Subject: [PATCH 19/24] fix --- .../accuracy/gorilla.patch | 632 +++++++++++++++++- src/llm/apis/openai_completions.cpp | 233 ++++++- src/llm/apis/openai_completions.hpp | 27 +- 3 files changed, 846 insertions(+), 46 deletions(-) diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index 48af0f5eac..1bb4cf984f 100644 --- a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -1,8 +1,17 @@ diff --git a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -index bb625d2..7204adb 100644 +index bb625d2..d06483d 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -@@ -2153,6 +2153,30 @@ third_party_inference_model_map = { +@@ -24,6 +24,8 @@ from bfcl_eval.model_handler.api_inference.openai_completion import ( + OpenAICompletionsHandler, + ) + from bfcl_eval.model_handler.api_inference.openai_response import OpenAIResponsesHandler ++from bfcl_eval.model_handler.api_inference.ovms_response import OVMSResponsesHandler ++from bfcl_eval.model_handler.api_inference.ovms_response_stream import OVMSResponsesStreamHandler + from bfcl_eval.model_handler.api_inference.qwen import ( + QwenAgentNoThinkHandler, + QwenAgentThinkHandler, +@@ -2153,6 +2155,54 @@ third_party_inference_model_map = { is_fc_model=True, underscore_to_dot=True, ), @@ -29,6 +38,30 @@ index bb625d2..7204adb 100644 + output_price=None, + is_fc_model=True, + underscore_to_dot=True, ++ ), ++ "ovms-model-responses": ModelConfig( ++ model_name="ovms-model-responses", ++ display_name="ovms-model-responses", ++ url="http://localhost:8000/v3", ++ org="ovms", ++ license="apache-2.0", ++ model_handler=OVMSResponsesHandler, ++ input_price=None, ++ output_price=None, ++ is_fc_model=True, ++ underscore_to_dot=True, ++ ), ++ "ovms-model-stream-responses": ModelConfig( ++ model_name="ovms-model-stream-responses", ++ display_name="ovms-model-stream-responses", ++ url="http://localhost:8000/v3", ++ org="ovms", ++ license="apache-2.0", ++ model_handler=OVMSResponsesStreamHandler, ++ input_price=None, ++ output_price=None, ++ is_fc_model=True, ++ underscore_to_dot=True, + ), } @@ -103,3 +136,598 @@ index 10f1a08..50890c7 100644 - }) \ No newline at end of file + }) +diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response.py 
b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response.py +new file mode 100644 +index 0000000..55d480d +--- /dev/null ++++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response.py +@@ -0,0 +1,270 @@ ++import json ++import os ++import time ++ ++from bfcl_eval.constants.type_mappings import GORILLA_TO_OPENAPI ++from bfcl_eval.model_handler.base_handler import BaseHandler ++from bfcl_eval.constants.enums import ModelStyle ++from bfcl_eval.model_handler.utils import ( ++ convert_to_function_call, ++ convert_to_tool, ++ default_decode_ast_prompting, ++ default_decode_execute_prompting, ++ format_execution_results_prompting, ++ retry_with_backoff, ++ system_prompt_pre_processing_chat_model, ++) ++from openai import OpenAI, RateLimitError ++from openai.types.responses import Response ++ ++ ++class OVMSResponsesHandler(BaseHandler): ++ """Handler for testing OVMS via the OpenAI Responses API (/v3/responses).""" ++ ++ def __init__( ++ self, ++ model_name, ++ temperature, ++ registry_name, ++ is_fc_model, ++ **kwargs, ++ ) -> None: ++ super().__init__(model_name, temperature, registry_name, is_fc_model, **kwargs) ++ self.model_style = ModelStyle.OPENAI_RESPONSES ++ self.client = OpenAI(**self._build_client_kwargs()) ++ ++ def _build_client_kwargs(self): ++ kwargs = {} ++ ++ if api_key := os.getenv("OPENAI_API_KEY", "unused"): ++ kwargs["api_key"] = api_key ++ ++ if base_url := os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v3"): ++ kwargs["base_url"] = base_url ++ ++ if headers_env := os.getenv("OPENAI_DEFAULT_HEADERS"): ++ kwargs["default_headers"] = json.loads(headers_env) ++ ++ return kwargs ++ ++ @staticmethod ++ def _substitute_prompt_role(prompts: list[dict]) -> list[dict]: ++ # OVMS responses API accepts "system" role, but following OpenAI convention ++ # we convert to "developer" for compatibility. 
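++        # Illustration (assumed message shapes; the list is mutated in place):
++        #   [{"role": "system", "content": "Be brief"}]
++        #   -> [{"role": "developer", "content": "Be brief"}]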
++ for prompt in prompts: ++ if prompt["role"] == "system": ++ prompt["role"] = "developer" ++ return prompts ++ ++ def decode_ast(self, result, language, has_tool_call_tag): ++ if self.is_fc_model: ++ decoded_output = [] ++ for invoked_function in result: ++ name = list(invoked_function.keys())[0] ++ params = json.loads(invoked_function[name]) ++ decoded_output.append({name: params}) ++ return decoded_output ++ else: ++ return default_decode_ast_prompting(result, language, has_tool_call_tag) ++ ++ def decode_execute(self, result, has_tool_call_tag): ++ if self.is_fc_model: ++ return convert_to_function_call(result) ++ else: ++ return default_decode_execute_prompting(result, has_tool_call_tag) ++ ++ @retry_with_backoff(error_type=RateLimitError) ++ def generate_with_backoff(self, **kwargs): ++ start_time = time.time() ++ api_response = self.client.responses.create(**kwargs) ++ end_time = time.time() ++ ++ return api_response, end_time - start_time ++ ++ #### FC methods #### ++ ++ def _query_FC(self, inference_data: dict): ++ message: list[dict] = inference_data["message"] ++ tools = inference_data["tools"] ++ ++ inference_data["inference_input_log"] = { ++ "message": repr(message), ++ "tools": tools, ++ } ++ ++ kwargs = { ++ "input": message, ++ "model": self.model_name, ++ "temperature": self.temperature, ++ "max_output_tokens": 2048, ++ "tool_choice": os.getenv("TOOL_CHOICE", "auto"), ++ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, ++ } ++ ++ if len(tools) > 0: ++ kwargs["tools"] = tools ++ ++ return self.generate_with_backoff(**kwargs) ++ ++ def _pre_query_processing_FC(self, inference_data: dict, test_entry: dict) -> dict: ++ for round_idx in range(len(test_entry["question"])): ++ test_entry["question"][round_idx] = self._substitute_prompt_role( ++ test_entry["question"][round_idx] ++ ) ++ ++ inference_data["message"] = [] ++ ++ return inference_data ++ ++ def _compile_tools(self, inference_data: dict, test_entry: dict) -> dict: ++ functions: list = test_entry["function"] ++ ++ tools = convert_to_tool(functions, GORILLA_TO_OPENAPI, self.model_style) ++ ++ inference_data["tools"] = tools ++ ++ return inference_data ++ ++ def _parse_query_response_FC(self, api_response: Response) -> dict: ++ model_responses = [] ++ tool_call_ids = [] ++ ++ for func_call in api_response.output: ++ if func_call.type == "function_call": ++ model_responses.append({func_call.name: func_call.arguments}) ++ tool_call_ids.append(func_call.call_id) ++ ++ if not model_responses: # If there are no function calls ++ model_responses = api_response.output_text ++ ++ reasoning_content = "" ++ for item in api_response.output: ++ if item.type == "reasoning": ++ for summary in item.summary: ++ reasoning_content += summary.text + "\n" ++ ++ return { ++ "model_responses": model_responses, ++ "model_responses_message_for_chat_history": api_response.output, ++ "tool_call_ids": tool_call_ids, ++ "reasoning_content": reasoning_content, ++ "input_token": api_response.usage.input_tokens, ++ "output_token": api_response.usage.output_tokens, ++ } ++ ++ def add_first_turn_message_FC( ++ self, inference_data: dict, first_turn_message: list[dict] ++ ) -> dict: ++ inference_data["message"].extend(first_turn_message) ++ return inference_data ++ ++ def _add_next_turn_user_message_FC( ++ self, inference_data: dict, user_message: list[dict] ++ ) -> dict: ++ inference_data["message"].extend(user_message) ++ return inference_data ++ ++ def _add_assistant_message_FC( ++ self, inference_data: 
dict, model_response_data: dict ++ ) -> dict: ++ inference_data["message"].extend( ++ model_response_data["model_responses_message_for_chat_history"] ++ ) ++ return inference_data ++ ++ def _add_execution_results_FC( ++ self, ++ inference_data: dict, ++ execution_results: list[str], ++ model_response_data: dict, ++ ) -> dict: ++ for execution_result, tool_call_id in zip( ++ execution_results, model_response_data["tool_call_ids"] ++ ): ++ tool_message = { ++ "type": "function_call_output", ++ "call_id": tool_call_id, ++ "output": execution_result, ++ } ++ inference_data["message"].append(tool_message) ++ ++ return inference_data ++ ++ #### Prompting methods #### ++ ++ def _query_prompting(self, inference_data: dict): ++ inference_data["inference_input_log"] = {"message": repr(inference_data["message"])} ++ ++ kwargs = { ++ "input": inference_data["message"], ++ "model": self.model_name, ++ "temperature": self.temperature, ++ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, ++ } ++ ++ return self.generate_with_backoff(**kwargs) ++ ++ def _pre_query_processing_prompting(self, test_entry: dict) -> dict: ++ functions: list = test_entry["function"] ++ test_entry_id: str = test_entry["id"] ++ ++ test_entry["question"][0] = system_prompt_pre_processing_chat_model( ++ test_entry["question"][0], functions, test_entry_id ++ ) ++ ++ for round_idx in range(len(test_entry["question"])): ++ test_entry["question"][round_idx] = self._substitute_prompt_role( ++ test_entry["question"][round_idx] ++ ) ++ ++ return {"message": []} ++ ++ def _parse_query_response_prompting(self, api_response: Response) -> dict: ++ reasoning_content = "" ++ for item in api_response.output: ++ if item.type == "reasoning": ++ for summary in item.summary: ++ reasoning_content += summary.text + "\n" ++ ++ return { ++ "model_responses": api_response.output_text, ++ "model_responses_message_for_chat_history": api_response.output, ++ "reasoning_content": reasoning_content, ++ "input_token": api_response.usage.input_tokens, ++ "output_token": api_response.usage.output_tokens, ++ } ++ ++ def add_first_turn_message_prompting( ++ self, inference_data: dict, first_turn_message: list[dict] ++ ) -> dict: ++ inference_data["message"].extend(first_turn_message) ++ return inference_data ++ ++ def _add_next_turn_user_message_prompting( ++ self, inference_data: dict, user_message: list[dict] ++ ) -> dict: ++ inference_data["message"].extend(user_message) ++ return inference_data ++ ++ def _add_assistant_message_prompting( ++ self, inference_data: dict, model_response_data: dict ++ ) -> dict: ++ inference_data["message"].extend( ++ model_response_data["model_responses_message_for_chat_history"] ++ ) ++ return inference_data ++ ++ def _add_execution_results_prompting( ++ self, ++ inference_data: dict, ++ execution_results: list[str], ++ model_response_data: dict, ++ ) -> dict: ++ formatted_results_message = format_execution_results_prompting( ++ inference_data, execution_results, model_response_data ++ ) ++ inference_data["message"].append( ++ {"role": "user", "content": formatted_results_message} ++ ) ++ ++ return inference_data +diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py +new file mode 100644 +index 0000000..89c9a65 +--- /dev/null ++++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py +@@ -0,0 
+1,313 @@ ++import json ++import os ++import time ++ ++from bfcl_eval.constants.type_mappings import GORILLA_TO_OPENAPI ++from bfcl_eval.model_handler.base_handler import BaseHandler ++from bfcl_eval.constants.enums import ModelStyle ++from bfcl_eval.model_handler.utils import ( ++ convert_to_function_call, ++ convert_to_tool, ++ default_decode_ast_prompting, ++ default_decode_execute_prompting, ++ format_execution_results_prompting, ++ retry_with_backoff, ++ system_prompt_pre_processing_chat_model, ++) ++from openai import OpenAI, RateLimitError ++ ++ ++class OVMSResponsesStreamHandler(BaseHandler): ++ """Handler for testing OVMS via the OpenAI Responses API with streaming (/v3/responses, stream=True).""" ++ ++ def __init__( ++ self, ++ model_name, ++ temperature, ++ registry_name, ++ is_fc_model, ++ **kwargs, ++ ) -> None: ++ super().__init__(model_name, temperature, registry_name, is_fc_model, **kwargs) ++ self.model_style = ModelStyle.OPENAI_RESPONSES ++ self.client = OpenAI(**self._build_client_kwargs()) ++ ++ def _build_client_kwargs(self): ++ kwargs = {} ++ ++ if api_key := os.getenv("OPENAI_API_KEY", "unused"): ++ kwargs["api_key"] = api_key ++ ++ if base_url := os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v3"): ++ kwargs["base_url"] = base_url ++ ++ if headers_env := os.getenv("OPENAI_DEFAULT_HEADERS"): ++ kwargs["default_headers"] = json.loads(headers_env) ++ ++ return kwargs ++ ++ @staticmethod ++ def _substitute_prompt_role(prompts: list[dict]) -> list[dict]: ++ for prompt in prompts: ++ if prompt["role"] == "system": ++ prompt["role"] = "developer" ++ return prompts ++ ++ def decode_ast(self, result, language, has_tool_call_tag): ++ if self.is_fc_model: ++ decoded_output = [] ++ for invoked_function in result: ++ name = list(invoked_function.keys())[0] ++ params = json.loads(invoked_function[name]) ++ decoded_output.append({name: params}) ++ return decoded_output ++ else: ++ return default_decode_ast_prompting(result, language, has_tool_call_tag) ++ ++ def decode_execute(self, result, has_tool_call_tag): ++ if self.is_fc_model: ++ return convert_to_function_call(result) ++ else: ++ return default_decode_execute_prompting(result, has_tool_call_tag) ++ ++ @retry_with_backoff(error_type=RateLimitError) ++ def generate_with_backoff(self, **kwargs): ++ start_time = time.time() ++ api_response = self.client.responses.create(**kwargs) ++ end_time = time.time() ++ ++ return api_response, end_time - start_time ++ ++ @staticmethod ++ def _parse_stream(stream) -> dict: ++ """Parse responses API SSE stream and return aggregated results.""" ++ text_content = "" ++ reasoning_content = "" ++ tool_calls = {} # keyed by call_id ++ usage = {"input_tokens": 0, "output_tokens": 0} ++ output_items = [] # final output items from response.completed ++ ++ for event in stream: ++ event_type = event.type ++ ++ if event_type == "response.output_text.delta": ++ text_content += event.delta or "" ++ ++ elif event_type == "response.reasoning.delta": ++ reasoning_content += event.delta or "" ++ ++ elif event_type == "response.function_call_arguments.delta": ++ item_id = event.item_id ++ if item_id not in tool_calls: ++ tool_calls[item_id] = {"call_id": "", "name": "", "arguments": ""} ++ tool_calls[item_id]["arguments"] += event.delta or "" ++ ++ elif event_type == "response.output_item.added": ++ item = event.item ++ if hasattr(item, "type") and item.type == "function_call": ++ item_id = item.id ++ tool_calls[item_id] = { ++ "call_id": getattr(item, "call_id", "") or "", ++ "name": getattr(item, 
"name", "") or "", ++ "arguments": "", ++ } ++ ++ elif event_type in ("response.completed", "response.incomplete"): ++ resp = event.response ++ if hasattr(resp, "usage") and resp.usage: ++ usage["input_tokens"] = resp.usage.input_tokens ++ usage["output_tokens"] = resp.usage.output_tokens ++ if hasattr(resp, "output"): ++ output_items = resp.output ++ ++ return { ++ "text": text_content, ++ "reasoning": reasoning_content, ++ "tool_calls": tool_calls, ++ "usage": usage, ++ "output_items": output_items, ++ } ++ ++ #### FC methods #### ++ ++ def _query_FC(self, inference_data: dict): ++ message: list[dict] = inference_data["message"] ++ tools = inference_data["tools"] ++ ++ inference_data["inference_input_log"] = { ++ "message": repr(message), ++ "tools": tools, ++ } ++ ++ kwargs = { ++ "input": message, ++ "model": self.model_name, ++ "temperature": self.temperature, ++ "max_output_tokens": 2048, ++ "tool_choice": os.getenv("TOOL_CHOICE", "auto"), ++ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, ++ "stream": True, ++ } ++ ++ if len(tools) > 0: ++ kwargs["tools"] = tools ++ ++ return self.generate_with_backoff(**kwargs) ++ ++ def _pre_query_processing_FC(self, inference_data: dict, test_entry: dict) -> dict: ++ for round_idx in range(len(test_entry["question"])): ++ test_entry["question"][round_idx] = self._substitute_prompt_role( ++ test_entry["question"][round_idx] ++ ) ++ ++ inference_data["message"] = [] ++ ++ return inference_data ++ ++ def _compile_tools(self, inference_data: dict, test_entry: dict) -> dict: ++ functions: list = test_entry["function"] ++ ++ tools = convert_to_tool(functions, GORILLA_TO_OPENAPI, self.model_style) ++ ++ inference_data["tools"] = tools ++ ++ return inference_data ++ ++ def _parse_query_response_FC(self, api_response) -> dict: ++ parsed = self._parse_stream(api_response) ++ ++ model_responses = [] ++ tool_call_ids = [] ++ ++ for item_id, tc in parsed["tool_calls"].items(): ++ model_responses.append({tc["name"]: tc["arguments"]}) ++ tool_call_ids.append(tc["call_id"]) ++ ++ if not model_responses: ++ model_responses = parsed["text"] ++ ++ # Use the output_items from the completed event for chat history ++ output_items = parsed["output_items"] ++ ++ return { ++ "model_responses": model_responses, ++ "model_responses_message_for_chat_history": output_items, ++ "tool_call_ids": tool_call_ids, ++ "reasoning_content": parsed["reasoning"], ++ "input_token": parsed["usage"]["input_tokens"], ++ "output_token": parsed["usage"]["output_tokens"], ++ } ++ ++ def add_first_turn_message_FC( ++ self, inference_data: dict, first_turn_message: list[dict] ++ ) -> dict: ++ inference_data["message"].extend(first_turn_message) ++ return inference_data ++ ++ def _add_next_turn_user_message_FC( ++ self, inference_data: dict, user_message: list[dict] ++ ) -> dict: ++ inference_data["message"].extend(user_message) ++ return inference_data ++ ++ def _add_assistant_message_FC( ++ self, inference_data: dict, model_response_data: dict ++ ) -> dict: ++ inference_data["message"].extend( ++ model_response_data["model_responses_message_for_chat_history"] ++ ) ++ return inference_data ++ ++ def _add_execution_results_FC( ++ self, ++ inference_data: dict, ++ execution_results: list[str], ++ model_response_data: dict, ++ ) -> dict: ++ for execution_result, tool_call_id in zip( ++ execution_results, model_response_data["tool_call_ids"] ++ ): ++ tool_message = { ++ "type": "function_call_output", ++ "call_id": tool_call_id, ++ "output": 
execution_result, ++ } ++ inference_data["message"].append(tool_message) ++ ++ return inference_data ++ ++ #### Prompting methods #### ++ ++ def _query_prompting(self, inference_data: dict): ++ inference_data["inference_input_log"] = {"message": repr(inference_data["message"])} ++ ++ kwargs = { ++ "input": inference_data["message"], ++ "model": self.model_name, ++ "temperature": self.temperature, ++ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, ++ "stream": True, ++ } ++ ++ return self.generate_with_backoff(**kwargs) ++ ++ def _pre_query_processing_prompting(self, test_entry: dict) -> dict: ++ functions: list = test_entry["function"] ++ test_entry_id: str = test_entry["id"] ++ ++ test_entry["question"][0] = system_prompt_pre_processing_chat_model( ++ test_entry["question"][0], functions, test_entry_id ++ ) ++ ++ for round_idx in range(len(test_entry["question"])): ++ test_entry["question"][round_idx] = self._substitute_prompt_role( ++ test_entry["question"][round_idx] ++ ) ++ ++ return {"message": []} ++ ++ def _parse_query_response_prompting(self, api_response) -> dict: ++ parsed = self._parse_stream(api_response) ++ ++ return { ++ "model_responses": parsed["text"], ++ "model_responses_message_for_chat_history": parsed["output_items"], ++ "reasoning_content": parsed["reasoning"], ++ "input_token": parsed["usage"]["input_tokens"], ++ "output_token": parsed["usage"]["output_tokens"], ++ } ++ ++ def add_first_turn_message_prompting( ++ self, inference_data: dict, first_turn_message: list[dict] ++ ) -> dict: ++ inference_data["message"].extend(first_turn_message) ++ return inference_data ++ ++ def _add_next_turn_user_message_prompting( ++ self, inference_data: dict, user_message: list[dict] ++ ) -> dict: ++ inference_data["message"].extend(user_message) ++ return inference_data ++ ++ def _add_assistant_message_prompting( ++ self, inference_data: dict, model_response_data: dict ++ ) -> dict: ++ inference_data["message"].extend( ++ model_response_data["model_responses_message_for_chat_history"] ++ ) ++ return inference_data ++ ++ def _add_execution_results_prompting( ++ self, ++ inference_data: dict, ++ execution_results: list[str], ++ model_response_data: dict, ++ ) -> dict: ++ formatted_results_message = format_execution_results_prompting( ++ inference_data, execution_results, model_response_data ++ ) ++ inference_data["message"].append( ++ {"role": "user", "content": formatted_results_message} ++ ) ++ ++ return inference_data diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 58c41fcc6a..a9af75fedf 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -209,7 +209,7 @@ void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer& writer writer.String("type"); writer.String(eventType); writer.String("sequence_number"); - writer.Uint64(responsesStreamingSequenceNumber++); + writer.Uint64(responsesState.sequenceNumber++); } void OpenAIChatCompletionsHandler::writeContentLocation(Writer& writer, const std::string& itemId, uint64_t outputIndex) { @@ -2111,7 +2128,7 @@ std::string OpenAIChatCompletionsHandler::serializeOutputTextDoneEvent(const std writeEventHeader(writer, "response.output_text.done"); writeContentLocation(writer, outputItemId, outputIndex); writer.String("text"); - writer.String(responsesStreamingOutputText.c_str()); + writer.String(responsesState.outputText.c_str()); // TODO: logprobs not supported writer.EndObject(); return 
buffer.GetString(); @@ -2123,7 +2140,7 @@ std::string OpenAIChatCompletionsHandler::serializeContentPartDoneEvent(const st writeEventHeader(writer, "response.content_part.done"); writeContentLocation(writer, outputItemId, outputIndex); writer.String("part"); - serializeResponsesPart(writer, responsesStreamingOutputText); + serializeResponsesPart(writer, responsesState.outputText); writer.EndObject(); return buffer.GetString(); } @@ -2136,7 +2153,7 @@ std::string OpenAIChatCompletionsHandler::serializeOutputItemDoneEvent(const std writer.String("output_index"); writer.Uint64(outputIndex); writer.String("item"); - serializeResponsesOutputItem(writer, outputItemId, responsesStreamingOutputText, itemStatus, true); + serializeResponsesOutputItem(writer, outputItemId, responsesState.outputText, itemStatus, true); writer.EndObject(); return buffer.GetString(); } @@ -2150,7 +2167,7 @@ std::string OpenAIChatCompletionsHandler::serializeResponseCompletedEvent(const Writer writer(buffer); writeEventHeader(writer, eventType); writer.String("response"); - serializeResponsesResponseObject(writer, responseId, createdAt, responseStatus, responsesStreamingOutputText, true, incompleteReason); + serializeResponsesResponseObject(writer, responseId, createdAt, responseStatus, responsesState.outputText, true, incompleteReason); writer.EndObject(); return buffer.GetString(); } @@ -2160,7 +2177,7 @@ std::string OpenAIChatCompletionsHandler::serializeResponseFailedEventBody(const Writer writer(buffer); writeEventHeader(writer, "response.failed"); writer.String("response"); - serializeResponsesResponseObject(writer, responseId, createdAt, "failed", responsesStreamingOutputText, false, + serializeResponsesResponseObject(writer, responseId, createdAt, "failed", responsesState.outputText, false, nullptr, errorMessage.c_str(), errorCode); writer.EndObject(); return buffer.GetString(); @@ -2219,7 +2236,7 @@ std::string OpenAIChatCompletionsHandler::serializeReasoningSummaryTextDoneEvent writeEventHeader(writer, "response.reasoning_summary_text.done"); writeReasoningLocation(writer, reasoningItemId); writer.String("text"); - writer.String(responsesStreamingReasoningText.c_str()); + writer.String(responsesState.reasoningText.c_str()); writer.EndObject(); return buffer.GetString(); } @@ -2234,7 +2251,7 @@ std::string OpenAIChatCompletionsHandler::serializeReasoningSummaryPartDoneEvent writer.String("type"); writer.String("summary_text"); writer.String("text"); - writer.String(responsesStreamingReasoningText.c_str()); + writer.String(responsesState.reasoningText.c_str()); writer.EndObject(); writer.EndObject(); return buffer.GetString(); @@ -2258,7 +2275,7 @@ std::string OpenAIChatCompletionsHandler::serializeReasoningOutputItemDoneEvent( writer.String("type"); writer.String("summary_text"); writer.String("text"); - writer.String(responsesStreamingReasoningText.c_str()); + writer.String(responsesState.reasoningText.c_str()); writer.EndObject(); writer.EndArray(); writer.EndObject(); @@ -2266,6 +2283,89 @@ std::string OpenAIChatCompletionsHandler::serializeReasoningOutputItemDoneEvent( return buffer.GetString(); } +std::string OpenAIChatCompletionsHandler::serializeFunctionCallOutputItemAddedEvent(const ToolCall& toolCall, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_item.added"); + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("item"); + writer.StartObject(); + writer.String("id"); + writer.String(toolCall.id.c_str()); + 
writer.String("type"); + writer.String("function_call"); + writer.String("status"); + writer.String("in_progress"); + writer.String("call_id"); + writer.String(toolCall.id.c_str()); + writer.String("name"); + writer.String(toolCall.name.c_str()); + writer.String("arguments"); + writer.String(""); + writer.EndObject(); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeFunctionCallArgumentsDeltaEvent(const std::string& callId, const std::string& delta, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.function_call_arguments.delta"); + writer.String("item_id"); + writer.String(callId.c_str()); + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("call_id"); + writer.String(callId.c_str()); + writer.String("delta"); + writer.String(delta.c_str()); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeFunctionCallArgumentsDoneEvent(const ToolCall& toolCall, uint64_t outputIndex) { + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.function_call_arguments.done"); + writer.String("item_id"); + writer.String(toolCall.id.c_str()); + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("call_id"); + writer.String(toolCall.id.c_str()); + writer.String("arguments"); + writer.String(toolCall.arguments.c_str()); + writer.EndObject(); + return buffer.GetString(); +} + +std::string OpenAIChatCompletionsHandler::serializeFunctionCallOutputItemDoneEvent(const ToolCall& toolCall, ov::genai::GenerationFinishReason finishReason, uint64_t outputIndex) { + const char* itemStatus = (finishReason == ov::genai::GenerationFinishReason::LENGTH) ? 
"incomplete" : "completed"; + StringBuffer buffer; + Writer writer(buffer); + writeEventHeader(writer, "response.output_item.done"); + writer.String("output_index"); + writer.Uint64(outputIndex); + writer.String("item"); + writer.StartObject(); + writer.String("id"); + writer.String(toolCall.id.c_str()); + writer.String("type"); + writer.String("function_call"); + writer.String("status"); + writer.String(itemStatus); + writer.String("call_id"); + writer.String(toolCall.id.c_str()); + writer.String("name"); + writer.String(toolCall.name.c_str()); + writer.String("arguments"); + writer.String(toolCall.arguments.c_str()); + writer.EndObject(); + writer.EndObject(); + return buffer.GetString(); +} + std::string OpenAIChatCompletionsHandler::serializeResponsesStreamingInitEvents() { const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); const std::string responseId = "resp-" + std::to_string(createdAt); @@ -2281,10 +2381,10 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesStreamingInitEvents( if (outputParser == nullptr) { events.emplace_back(serializeOutputItemAddedEvent(outputItemId)); events.emplace_back(serializeContentPartAddedEvent(outputItemId)); - responsesMessageInitialized = true; + responsesState.messageInitialized = true; } - responsesStreamingInitialized = true; + responsesState.initialized = true; std::stringstream ss; ss << events.front(); @@ -2303,7 +2403,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str const std::string reasoningItemId = "rs-0"; std::vector events; - if (!responsesStreamingInitialized) { + if (!responsesState.initialized) { // Fallback: if init events were not sent earlier, emit them now std::string initEvents = serializeResponsesStreamingInitEvents(); if (!initEvents.empty()) { @@ -2319,60 +2419,119 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str const auto& deltaObj = (*delta)["delta"]; if (deltaObj.HasMember("reasoning_content") && deltaObj["reasoning_content"].IsString()) { // Reasoning chunk - if (!responsesReasoningInitialized) { + if (!responsesState.reasoningInitialized) { events.emplace_back(serializeReasoningOutputItemAddedEvent(reasoningItemId)); events.emplace_back(serializeReasoningSummaryPartAddedEvent(reasoningItemId)); - responsesReasoningInitialized = true; + responsesState.reasoningInitialized = true; } const std::string reasoningText = deltaObj["reasoning_content"].GetString(); - responsesStreamingReasoningText += reasoningText; + responsesState.reasoningText += reasoningText; events.emplace_back(serializeReasoningSummaryTextDeltaEvent(reasoningItemId, reasoningText)); } else if (deltaObj.HasMember("content") && deltaObj["content"].IsString()) { // Content chunk - close reasoning if it was active, init message if needed - if (responsesReasoningInitialized && !responsesReasoningCompleted) { + if (responsesState.reasoningInitialized && !responsesState.reasoningCompleted) { events.emplace_back(serializeReasoningSummaryTextDoneEvent(reasoningItemId)); events.emplace_back(serializeReasoningSummaryPartDoneEvent(reasoningItemId)); events.emplace_back(serializeReasoningOutputItemDoneEvent(reasoningItemId)); - responsesReasoningCompleted = true; + responsesState.reasoningCompleted = true; } - const uint64_t msgIdx = responsesReasoningInitialized ? 1 : 0; - if (!responsesMessageInitialized) { + const uint64_t msgIdx = responsesState.reasoningInitialized ? 
1 : 0; + if (!responsesState.messageInitialized) { events.emplace_back(serializeOutputItemAddedEvent(outputItemId, msgIdx)); events.emplace_back(serializeContentPartAddedEvent(outputItemId, msgIdx)); - responsesMessageInitialized = true; + responsesState.messageInitialized = true; } const std::string contentText = deltaObj["content"].GetString(); - responsesStreamingOutputText += contentText; + responsesState.outputText += contentText; events.emplace_back(serializeOutputTextDeltaEvent(outputItemId, contentText, msgIdx)); + } else if (deltaObj.HasMember("tool_calls") && deltaObj["tool_calls"].IsArray()) { + // Tool call chunk - close reasoning if active + if (responsesState.reasoningInitialized && !responsesState.reasoningCompleted) { + events.emplace_back(serializeReasoningSummaryTextDoneEvent(reasoningItemId)); + events.emplace_back(serializeReasoningSummaryPartDoneEvent(reasoningItemId)); + events.emplace_back(serializeReasoningOutputItemDoneEvent(reasoningItemId)); + responsesState.reasoningCompleted = true; + } + const auto& toolCallsArr = deltaObj["tool_calls"]; + for (rapidjson::SizeType i = 0; i < toolCallsArr.Size(); ++i) { + const auto& tc = toolCallsArr[i]; + int tcIndex = tc.HasMember("index") ? tc["index"].GetInt() : 0; + // Determine the output index for this tool call + const uint64_t baseIdx = responsesState.reasoningInitialized ? 1 : 0; + const uint64_t tcOutputIdx = baseIdx + static_cast(tcIndex); + // Determine if this is a new tool call (has function name) + bool isNewToolCall = false; + std::string funcName; + std::string tcId; + std::string argDelta; + if (tc.HasMember("function") && tc["function"].IsObject()) { + const auto& funcObj = tc["function"]; + if (funcObj.HasMember("name") && funcObj["name"].IsString()) { + funcName = funcObj["name"].GetString(); + isNewToolCall = true; + } + if (funcObj.HasMember("arguments") && funcObj["arguments"].IsString()) { + argDelta = funcObj["arguments"].GetString(); + } + } + if (tc.HasMember("id") && tc["id"].IsString()) { + tcId = tc["id"].GetString(); + } + if (isNewToolCall) { + // Ensure we have enough entries in our tracking vector + while (static_cast(responsesState.toolCalls.size()) <= tcIndex) { + responsesState.toolCalls.push_back(ToolCall{}); + } + responsesState.toolCalls[tcIndex].id = tcId; + responsesState.toolCalls[tcIndex].name = funcName; + responsesState.toolCalls[tcIndex].arguments = ""; + events.emplace_back(serializeFunctionCallOutputItemAddedEvent(responsesState.toolCalls[tcIndex], tcOutputIdx)); + } + if (!argDelta.empty() && static_cast(responsesState.toolCalls.size()) > tcIndex) { + responsesState.toolCalls[tcIndex].arguments += argDelta; + events.emplace_back(serializeFunctionCallArgumentsDeltaEvent(responsesState.toolCalls[tcIndex].id, argDelta, tcOutputIdx)); + } + } } } // If delta is nullopt, the parser is accumulating tag tokens - skip } else { // No parser - pass through raw text if (!chunkResponse.empty()) { - responsesStreamingOutputText += chunkResponse; + responsesState.outputText += chunkResponse; events.emplace_back(serializeOutputTextDeltaEvent(outputItemId, chunkResponse)); } } if (finishReason != ov::genai::GenerationFinishReason::NONE) { // Close any open reasoning that wasn't closed by content transition - if (responsesReasoningInitialized && !responsesReasoningCompleted) { + if (responsesState.reasoningInitialized && !responsesState.reasoningCompleted) { events.emplace_back(serializeReasoningSummaryTextDoneEvent(reasoningItemId)); 
events.emplace_back(serializeReasoningSummaryPartDoneEvent(reasoningItemId)); events.emplace_back(serializeReasoningOutputItemDoneEvent(reasoningItemId)); - responsesReasoningCompleted = true; + responsesState.reasoningCompleted = true; + } + // Emit done events for any streaming tool calls + if (!responsesState.toolCalls.empty()) { + const uint64_t baseIdx = responsesState.reasoningInitialized ? 1 : 0; + for (size_t i = 0; i < responsesState.toolCalls.size(); ++i) { + const uint64_t tcOutputIdx = baseIdx + static_cast(i); + events.emplace_back(serializeFunctionCallArgumentsDoneEvent(responsesState.toolCalls[i], tcOutputIdx)); + events.emplace_back(serializeFunctionCallOutputItemDoneEvent(responsesState.toolCalls[i], finishReason, tcOutputIdx)); + } } - const uint64_t msgIdx = responsesReasoningInitialized ? 1 : 0; - // Ensure message item is initialized even if no content was produced - if (!responsesMessageInitialized) { - events.emplace_back(serializeOutputItemAddedEvent(outputItemId, msgIdx)); - events.emplace_back(serializeContentPartAddedEvent(outputItemId, msgIdx)); - responsesMessageInitialized = true; + // Only emit message item if content was produced or no tool calls were generated + if (!responsesState.outputText.empty() || responsesState.toolCalls.empty()) { + const uint64_t msgIdx = (responsesState.reasoningInitialized ? 1 : 0) + responsesState.toolCalls.size(); + if (!responsesState.messageInitialized) { + events.emplace_back(serializeOutputItemAddedEvent(outputItemId, msgIdx)); + events.emplace_back(serializeContentPartAddedEvent(outputItemId, msgIdx)); + responsesState.messageInitialized = true; + } + events.emplace_back(serializeOutputTextDoneEvent(outputItemId, msgIdx)); + events.emplace_back(serializeContentPartDoneEvent(outputItemId, msgIdx)); + events.emplace_back(serializeOutputItemDoneEvent(outputItemId, finishReason, msgIdx)); } - events.emplace_back(serializeOutputTextDoneEvent(outputItemId, msgIdx)); - events.emplace_back(serializeContentPartDoneEvent(outputItemId, msgIdx)); - events.emplace_back(serializeOutputItemDoneEvent(outputItemId, finishReason, msgIdx)); events.emplace_back(serializeResponseCompletedEvent(responseId, createdAt, finishReason)); } @@ -2485,7 +2644,7 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesFailedEvent(const st const std::string responseId = "resp-" + std::to_string(createdAt); std::vector events; - if (!responsesStreamingInitialized) { + if (!responsesState.initialized) { std::string initEvents = serializeResponsesStreamingInitEvents(); if (!initEvents.empty()) { events.emplace_back(std::move(initEvents)); diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index f5eb54dd2c..06743e6420 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -62,6 +62,19 @@ struct CompletionUsageStatistics { // Class that wraps OpenAI request, holds and processes raw JSON, provides methods for serialization and keeps track of usage. // It is used in the calculator. + +// Encapsulates all mutable state accumulated during Responses API streaming. 
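+// Illustrative output_index layout implied by this state for one full stream,
+// assuming one reasoning item and two streamed tool calls:
+//   index 0    -> reasoning item "rs-0"
+//   index 1..2 -> function_call items (baseIdx + tool call index)
+//   index 3    -> message item "msg-0" (emitted when text content was produced,
+//                 or when no tool calls were generated)
+// sequenceNumber starts at 1 so response.created carries sequence_number 1.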
+struct ResponsesStreamingState { + size_t sequenceNumber = 1; + bool initialized = false; + bool reasoningInitialized = false; + bool reasoningCompleted = false; + bool messageInitialized = false; + std::string outputText; + std::string reasoningText; + ToolCalls_t toolCalls; +}; + class OpenAIChatCompletionsHandler { Document& doc; Endpoint endpoint; @@ -71,13 +84,7 @@ class OpenAIChatCompletionsHandler { ov::genai::Tokenizer tokenizer; size_t processedTokens = 0; // tracks overall number of tokens processed by the pipeline bool toolCallsDetectedInStream = false; // tracks whether tool calls were detected in any streaming chunk - size_t responsesStreamingSequenceNumber = 1; - bool responsesStreamingInitialized = false; - std::string responsesStreamingOutputText; - bool responsesReasoningInitialized = false; - bool responsesReasoningCompleted = false; - bool responsesMessageInitialized = false; - std::string responsesStreamingReasoningText; + ResponsesStreamingState responsesState; // Output parser is used to parse chat completions response to extract specific fields like tool calls and reasoning. std::unique_ptr outputParser = nullptr; @@ -128,6 +135,12 @@ class OpenAIChatCompletionsHandler { std::string serializeReasoningOutputItemDoneEvent(const std::string& reasoningItemId); std::string serializeResponseFailedEventBody(const std::string& responseId, int64_t createdAt, const std::string& errorMessage, const char* errorCode); + // Function call streaming event serializers + std::string serializeFunctionCallOutputItemAddedEvent(const ToolCall& toolCall, uint64_t outputIndex); + std::string serializeFunctionCallArgumentsDeltaEvent(const std::string& callId, const std::string& delta, uint64_t outputIndex); + std::string serializeFunctionCallArgumentsDoneEvent(const ToolCall& toolCall, uint64_t outputIndex); + std::string serializeFunctionCallOutputItemDoneEvent(const ToolCall& toolCall, ov::genai::GenerationFinishReason finishReason, uint64_t outputIndex); + public: OpenAIChatCompletionsHandler(Document& doc, Endpoint endpoint, std::chrono::time_point creationTime, ov::genai::Tokenizer tokenizer, const std::string& toolParserName = "", const std::string& reasoningParserName = "") : From 9c155e611227aed5fdf9fd613d6a8e5a4d35f008 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Mon, 23 Mar 2026 10:20:31 +0100 Subject: [PATCH 20/24] fix --- .../accuracy/gorilla.patch | 522 +++--------------- src/llm/apis/openai_completions.cpp | 239 ++++---- src/llm/apis/openai_completions.hpp | 25 +- src/test/http_openai_handler_test.cpp | 2 +- 4 files changed, 230 insertions(+), 558 deletions(-) diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index 1bb4cf984f..8f79651353 100644 --- a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -93,90 +93,27 @@ index 357584f..e45e12c 100644 "store": False, } -diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py -index 10f1a08..50890c7 100644 ---- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py -+++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py -@@ -7,6 +7,7 @@ from openai import OpenAI - from overrides import override - from qwen_agent.llm import get_chat_model - import time -+import json - - class QwenAPIHandler(OpenAICompletionsHandler): - """ -@@ 
-28,8 +29,8 @@ class QwenAPIHandler(OpenAICompletionsHandler): - super().__init__(model_name, temperature, registry_name, is_fc_model, **kwargs) - self.model_style = ModelStyle.OPENAI_COMPLETIONS - self.client = OpenAI( -- base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", -- api_key=os.getenv("QWEN_API_KEY"), -+ base_url=os.getenv("OPENAI_BASE_URL", "https://localhost:8000/v3"), -+ api_key=os.getenv("QWEN_API_KEY","unused"), - ) - - #### FC methods #### -@@ -45,9 +46,9 @@ class QwenAPIHandler(OpenAICompletionsHandler): - model=self.model_name.replace("-FC", ""), - tools=tools, - parallel_tool_calls=True, -- extra_body={ -- "enable_thinking": True -- }, -+ max_completion_tokens=2048, -+ tool_choice=os.getenv("TOOL_CHOICE", "auto"), -+ extra_body={"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, - stream=True, - stream_options={ - "include_usage": True -@@ -352,4 +353,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler): - 'timeout': 1000, - 'max_tokens': 16384 - } -- }) -\ No newline at end of file -+ }) diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response.py new file mode 100644 -index 0000000..55d480d +index 0000000..f8e19e8 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response.py -@@ -0,0 +1,270 @@ +@@ -0,0 +1,65 @@ +import json +import os -+import time -+ -+from bfcl_eval.constants.type_mappings import GORILLA_TO_OPENAPI -+from bfcl_eval.model_handler.base_handler import BaseHandler -+from bfcl_eval.constants.enums import ModelStyle -+from bfcl_eval.model_handler.utils import ( -+ convert_to_function_call, -+ convert_to_tool, -+ default_decode_ast_prompting, -+ default_decode_execute_prompting, -+ format_execution_results_prompting, -+ retry_with_backoff, -+ system_prompt_pre_processing_chat_model, -+) -+from openai import OpenAI, RateLimitError -+from openai.types.responses import Response -+ -+ -+class OVMSResponsesHandler(BaseHandler): -+ """Handler for testing OVMS via the OpenAI Responses API (/v3/responses).""" -+ -+ def __init__( -+ self, -+ model_name, -+ temperature, -+ registry_name, -+ is_fc_model, -+ **kwargs, -+ ) -> None: -+ super().__init__(model_name, temperature, registry_name, is_fc_model, **kwargs) -+ self.model_style = ModelStyle.OPENAI_RESPONSES -+ self.client = OpenAI(**self._build_client_kwargs()) + ++from bfcl_eval.model_handler.api_inference.openai_response import OpenAIResponsesHandler ++from overrides import override ++ ++ ++class OVMSResponsesHandler(OpenAIResponsesHandler): ++ """OVMS variant of OpenAIResponsesHandler. ++ ++ Inherits all behavior and only overrides _build_client_kwargs (OVMS defaults), ++ _query_FC and _query_prompting (OVMS-specific kwargs instead of OpenAI reasoning model logic). ++ """ ++ ++ @override + def _build_client_kwargs(self): + kwargs = {} + @@ -191,42 +128,7 @@ index 0000000..55d480d + + return kwargs + -+ @staticmethod -+ def _substitute_prompt_role(prompts: list[dict]) -> list[dict]: -+ # OVMS responses API accepts "system" role, but following OpenAI convention -+ # we convert to "developer" for compatibility. 
-+ for prompt in prompts: -+ if prompt["role"] == "system": -+ prompt["role"] = "developer" -+ return prompts -+ -+ def decode_ast(self, result, language, has_tool_call_tag): -+ if self.is_fc_model: -+ decoded_output = [] -+ for invoked_function in result: -+ name = list(invoked_function.keys())[0] -+ params = json.loads(invoked_function[name]) -+ decoded_output.append({name: params}) -+ return decoded_output -+ else: -+ return default_decode_ast_prompting(result, language, has_tool_call_tag) -+ -+ def decode_execute(self, result, has_tool_call_tag): -+ if self.is_fc_model: -+ return convert_to_function_call(result) -+ else: -+ return default_decode_execute_prompting(result, has_tool_call_tag) -+ -+ @retry_with_backoff(error_type=RateLimitError) -+ def generate_with_backoff(self, **kwargs): -+ start_time = time.time() -+ api_response = self.client.responses.create(**kwargs) -+ end_time = time.time() -+ -+ return api_response, end_time - start_time -+ -+ #### FC methods #### -+ ++ @override + def _query_FC(self, inference_data: dict): + message: list[dict] = inference_data["message"] + tools = inference_data["tools"] @@ -250,92 +152,7 @@ index 0000000..55d480d + + return self.generate_with_backoff(**kwargs) + -+ def _pre_query_processing_FC(self, inference_data: dict, test_entry: dict) -> dict: -+ for round_idx in range(len(test_entry["question"])): -+ test_entry["question"][round_idx] = self._substitute_prompt_role( -+ test_entry["question"][round_idx] -+ ) -+ -+ inference_data["message"] = [] -+ -+ return inference_data -+ -+ def _compile_tools(self, inference_data: dict, test_entry: dict) -> dict: -+ functions: list = test_entry["function"] -+ -+ tools = convert_to_tool(functions, GORILLA_TO_OPENAPI, self.model_style) -+ -+ inference_data["tools"] = tools -+ -+ return inference_data -+ -+ def _parse_query_response_FC(self, api_response: Response) -> dict: -+ model_responses = [] -+ tool_call_ids = [] -+ -+ for func_call in api_response.output: -+ if func_call.type == "function_call": -+ model_responses.append({func_call.name: func_call.arguments}) -+ tool_call_ids.append(func_call.call_id) -+ -+ if not model_responses: # If there are no function calls -+ model_responses = api_response.output_text -+ -+ reasoning_content = "" -+ for item in api_response.output: -+ if item.type == "reasoning": -+ for summary in item.summary: -+ reasoning_content += summary.text + "\n" -+ -+ return { -+ "model_responses": model_responses, -+ "model_responses_message_for_chat_history": api_response.output, -+ "tool_call_ids": tool_call_ids, -+ "reasoning_content": reasoning_content, -+ "input_token": api_response.usage.input_tokens, -+ "output_token": api_response.usage.output_tokens, -+ } -+ -+ def add_first_turn_message_FC( -+ self, inference_data: dict, first_turn_message: list[dict] -+ ) -> dict: -+ inference_data["message"].extend(first_turn_message) -+ return inference_data -+ -+ def _add_next_turn_user_message_FC( -+ self, inference_data: dict, user_message: list[dict] -+ ) -> dict: -+ inference_data["message"].extend(user_message) -+ return inference_data -+ -+ def _add_assistant_message_FC( -+ self, inference_data: dict, model_response_data: dict -+ ) -> dict: -+ inference_data["message"].extend( -+ model_response_data["model_responses_message_for_chat_history"] -+ ) -+ return inference_data -+ -+ def _add_execution_results_FC( -+ self, -+ inference_data: dict, -+ execution_results: list[str], -+ model_response_data: dict, -+ ) -> dict: -+ for execution_result, tool_call_id in zip( -+ 
execution_results, model_response_data["tool_call_ids"] -+ ): -+ tool_message = { -+ "type": "function_call_output", -+ "call_id": tool_call_id, -+ "output": execution_result, -+ } -+ inference_data["message"].append(tool_message) -+ -+ return inference_data -+ -+ #### Prompting methods #### -+ ++ @override + def _query_prompting(self, inference_data: dict): + inference_data["inference_input_log"] = {"message": repr(inference_data["message"])} + @@ -347,111 +164,29 @@ index 0000000..55d480d + } + + return self.generate_with_backoff(**kwargs) -+ -+ def _pre_query_processing_prompting(self, test_entry: dict) -> dict: -+ functions: list = test_entry["function"] -+ test_entry_id: str = test_entry["id"] -+ -+ test_entry["question"][0] = system_prompt_pre_processing_chat_model( -+ test_entry["question"][0], functions, test_entry_id -+ ) -+ -+ for round_idx in range(len(test_entry["question"])): -+ test_entry["question"][round_idx] = self._substitute_prompt_role( -+ test_entry["question"][round_idx] -+ ) -+ -+ return {"message": []} -+ -+ def _parse_query_response_prompting(self, api_response: Response) -> dict: -+ reasoning_content = "" -+ for item in api_response.output: -+ if item.type == "reasoning": -+ for summary in item.summary: -+ reasoning_content += summary.text + "\n" -+ -+ return { -+ "model_responses": api_response.output_text, -+ "model_responses_message_for_chat_history": api_response.output, -+ "reasoning_content": reasoning_content, -+ "input_token": api_response.usage.input_tokens, -+ "output_token": api_response.usage.output_tokens, -+ } -+ -+ def add_first_turn_message_prompting( -+ self, inference_data: dict, first_turn_message: list[dict] -+ ) -> dict: -+ inference_data["message"].extend(first_turn_message) -+ return inference_data -+ -+ def _add_next_turn_user_message_prompting( -+ self, inference_data: dict, user_message: list[dict] -+ ) -> dict: -+ inference_data["message"].extend(user_message) -+ return inference_data -+ -+ def _add_assistant_message_prompting( -+ self, inference_data: dict, model_response_data: dict -+ ) -> dict: -+ inference_data["message"].extend( -+ model_response_data["model_responses_message_for_chat_history"] -+ ) -+ return inference_data -+ -+ def _add_execution_results_prompting( -+ self, -+ inference_data: dict, -+ execution_results: list[str], -+ model_response_data: dict, -+ ) -> dict: -+ formatted_results_message = format_execution_results_prompting( -+ inference_data, execution_results, model_response_data -+ ) -+ inference_data["message"].append( -+ {"role": "user", "content": formatted_results_message} -+ ) -+ -+ return inference_data diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py new file mode 100644 -index 0000000..89c9a65 +index 0000000..0313452 --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py -@@ -0,0 +1,313 @@ +@@ -0,0 +1,158 @@ +import json +import os -+import time -+ -+from bfcl_eval.constants.type_mappings import GORILLA_TO_OPENAPI -+from bfcl_eval.model_handler.base_handler import BaseHandler -+from bfcl_eval.constants.enums import ModelStyle -+from bfcl_eval.model_handler.utils import ( -+ convert_to_function_call, -+ convert_to_tool, -+ default_decode_ast_prompting, -+ default_decode_execute_prompting, -+ format_execution_results_prompting, -+ retry_with_backoff, -+ 
system_prompt_pre_processing_chat_model, -+) -+from openai import OpenAI, RateLimitError -+ -+ -+class OVMSResponsesStreamHandler(BaseHandler): -+ """Handler for testing OVMS via the OpenAI Responses API with streaming (/v3/responses, stream=True).""" -+ -+ def __init__( -+ self, -+ model_name, -+ temperature, -+ registry_name, -+ is_fc_model, -+ **kwargs, -+ ) -> None: -+ super().__init__(model_name, temperature, registry_name, is_fc_model, **kwargs) -+ self.model_style = ModelStyle.OPENAI_RESPONSES -+ self.client = OpenAI(**self._build_client_kwargs()) + ++from bfcl_eval.model_handler.api_inference.openai_response import OpenAIResponsesHandler ++from overrides import override ++ ++ ++class OVMSResponsesStreamHandler(OpenAIResponsesHandler): ++ """Streaming variant of OpenAIResponsesHandler. ++ ++ Inherits all behavior from OpenAIResponsesHandler and only overrides ++ _build_client_kwargs (OVMS defaults), _query_FC, _query_prompting ++ (to add stream=True), and the two _parse_query_response methods ++ (to aggregate SSE events). ++ """ ++ ++ @override + def _build_client_kwargs(self): + kwargs = {} + @@ -467,43 +202,11 @@ index 0000000..89c9a65 + return kwargs + + @staticmethod -+ def _substitute_prompt_role(prompts: list[dict]) -> list[dict]: -+ for prompt in prompts: -+ if prompt["role"] == "system": -+ prompt["role"] = "developer" -+ return prompts -+ -+ def decode_ast(self, result, language, has_tool_call_tag): -+ if self.is_fc_model: -+ decoded_output = [] -+ for invoked_function in result: -+ name = list(invoked_function.keys())[0] -+ params = json.loads(invoked_function[name]) -+ decoded_output.append({name: params}) -+ return decoded_output -+ else: -+ return default_decode_ast_prompting(result, language, has_tool_call_tag) -+ -+ def decode_execute(self, result, has_tool_call_tag): -+ if self.is_fc_model: -+ return convert_to_function_call(result) -+ else: -+ return default_decode_execute_prompting(result, has_tool_call_tag) -+ -+ @retry_with_backoff(error_type=RateLimitError) -+ def generate_with_backoff(self, **kwargs): -+ start_time = time.time() -+ api_response = self.client.responses.create(**kwargs) -+ end_time = time.time() -+ -+ return api_response, end_time - start_time -+ -+ @staticmethod + def _parse_stream(stream) -> dict: + """Parse responses API SSE stream and return aggregated results.""" + text_content = "" + reasoning_content = "" -+ tool_calls = {} # keyed by call_id ++ tool_calls = {} # keyed by item_id + usage = {"input_tokens": 0, "output_tokens": 0} + output_items = [] # final output items from response.completed + @@ -550,6 +253,7 @@ index 0000000..89c9a65 + + #### FC methods #### + ++ @override + def _query_FC(self, inference_data: dict): + message: list[dict] = inference_data["message"] + tools = inference_data["tools"] @@ -574,25 +278,7 @@ index 0000000..89c9a65 + + return self.generate_with_backoff(**kwargs) + -+ def _pre_query_processing_FC(self, inference_data: dict, test_entry: dict) -> dict: -+ for round_idx in range(len(test_entry["question"])): -+ test_entry["question"][round_idx] = self._substitute_prompt_role( -+ test_entry["question"][round_idx] -+ ) -+ -+ inference_data["message"] = [] -+ -+ return inference_data -+ -+ def _compile_tools(self, inference_data: dict, test_entry: dict) -> dict: -+ functions: list = test_entry["function"] -+ -+ tools = convert_to_tool(functions, GORILLA_TO_OPENAPI, self.model_style) -+ -+ inference_data["tools"] = tools -+ -+ return inference_data -+ ++ @override + def _parse_query_response_FC(self, api_response) -> 
dict: + parsed = self._parse_stream(api_response) + @@ -606,58 +292,18 @@ index 0000000..89c9a65 + if not model_responses: + model_responses = parsed["text"] + -+ # Use the output_items from the completed event for chat history -+ output_items = parsed["output_items"] -+ + return { + "model_responses": model_responses, -+ "model_responses_message_for_chat_history": output_items, ++ "model_responses_message_for_chat_history": parsed["output_items"], + "tool_call_ids": tool_call_ids, + "reasoning_content": parsed["reasoning"], + "input_token": parsed["usage"]["input_tokens"], + "output_token": parsed["usage"]["output_tokens"], + } + -+ def add_first_turn_message_FC( -+ self, inference_data: dict, first_turn_message: list[dict] -+ ) -> dict: -+ inference_data["message"].extend(first_turn_message) -+ return inference_data -+ -+ def _add_next_turn_user_message_FC( -+ self, inference_data: dict, user_message: list[dict] -+ ) -> dict: -+ inference_data["message"].extend(user_message) -+ return inference_data -+ -+ def _add_assistant_message_FC( -+ self, inference_data: dict, model_response_data: dict -+ ) -> dict: -+ inference_data["message"].extend( -+ model_response_data["model_responses_message_for_chat_history"] -+ ) -+ return inference_data -+ -+ def _add_execution_results_FC( -+ self, -+ inference_data: dict, -+ execution_results: list[str], -+ model_response_data: dict, -+ ) -> dict: -+ for execution_result, tool_call_id in zip( -+ execution_results, model_response_data["tool_call_ids"] -+ ): -+ tool_message = { -+ "type": "function_call_output", -+ "call_id": tool_call_id, -+ "output": execution_result, -+ } -+ inference_data["message"].append(tool_message) -+ -+ return inference_data -+ + #### Prompting methods #### + ++ @override + def _query_prompting(self, inference_data: dict): + inference_data["inference_input_log"] = {"message": repr(inference_data["message"])} + @@ -671,21 +317,7 @@ index 0000000..89c9a65 + + return self.generate_with_backoff(**kwargs) + -+ def _pre_query_processing_prompting(self, test_entry: dict) -> dict: -+ functions: list = test_entry["function"] -+ test_entry_id: str = test_entry["id"] -+ -+ test_entry["question"][0] = system_prompt_pre_processing_chat_model( -+ test_entry["question"][0], functions, test_entry_id -+ ) -+ -+ for round_idx in range(len(test_entry["question"])): -+ test_entry["question"][round_idx] = self._substitute_prompt_role( -+ test_entry["question"][round_idx] -+ ) -+ -+ return {"message": []} -+ ++ @override + def _parse_query_response_prompting(self, api_response) -> dict: + parsed = self._parse_stream(api_response) + @@ -696,38 +328,46 @@ index 0000000..89c9a65 + "input_token": parsed["usage"]["input_tokens"], + "output_token": parsed["usage"]["output_tokens"], + } -+ -+ def add_first_turn_message_prompting( -+ self, inference_data: dict, first_turn_message: list[dict] -+ ) -> dict: -+ inference_data["message"].extend(first_turn_message) -+ return inference_data -+ -+ def _add_next_turn_user_message_prompting( -+ self, inference_data: dict, user_message: list[dict] -+ ) -> dict: -+ inference_data["message"].extend(user_message) -+ return inference_data -+ -+ def _add_assistant_message_prompting( -+ self, inference_data: dict, model_response_data: dict -+ ) -> dict: -+ inference_data["message"].extend( -+ model_response_data["model_responses_message_for_chat_history"] -+ ) -+ return inference_data -+ -+ def _add_execution_results_prompting( -+ self, -+ inference_data: dict, -+ execution_results: list[str], -+ model_response_data: dict, 
-+ ) -> dict: -+ formatted_results_message = format_execution_results_prompting( -+ inference_data, execution_results, model_response_data -+ ) -+ inference_data["message"].append( -+ {"role": "user", "content": formatted_results_message} -+ ) -+ -+ return inference_data +diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py +index 10f1a08..50890c7 100644 +--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py ++++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py +@@ -7,6 +7,7 @@ from openai import OpenAI + from overrides import override + from qwen_agent.llm import get_chat_model + import time ++import json + + class QwenAPIHandler(OpenAICompletionsHandler): + """ +@@ -28,8 +29,8 @@ class QwenAPIHandler(OpenAICompletionsHandler): + super().__init__(model_name, temperature, registry_name, is_fc_model, **kwargs) + self.model_style = ModelStyle.OPENAI_COMPLETIONS + self.client = OpenAI( +- base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", +- api_key=os.getenv("QWEN_API_KEY"), ++ base_url=os.getenv("OPENAI_BASE_URL", "https://localhost:8000/v3"), ++ api_key=os.getenv("QWEN_API_KEY","unused"), + ) + + #### FC methods #### +@@ -45,9 +46,9 @@ class QwenAPIHandler(OpenAICompletionsHandler): + model=self.model_name.replace("-FC", ""), + tools=tools, + parallel_tool_calls=True, +- extra_body={ +- "enable_thinking": True +- }, ++ max_completion_tokens=2048, ++ tool_choice=os.getenv("TOOL_CHOICE", "auto"), ++ extra_body={"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, + stream=True, + stream_options={ + "include_usage": True +@@ -352,4 +353,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler): + 'timeout': 1000, + 'max_tokens': 16384 + } +- }) +\ No newline at end of file ++ }) diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index a9af75fedf..608a5381fd 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -45,6 +45,8 @@ using namespace rapidjson; namespace ovms { constexpr size_t DEFAULT_MAX_STOP_WORDS = 16; // same as deep-seek +constexpr std::string_view BASE64_PREFIX = "base64,"; +constexpr int64_t MAX_IMAGE_SIZE_BYTES = 20000000; // 20MB namespace { @@ -130,8 +132,8 @@ void OpenAIChatCompletionsHandler::serializeResponsesTools(Writer& } void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer& writer, const std::string& responseId, int64_t createdAt, - const char* status, const std::string& fullOutputText, bool includeUsage, - const char* incompleteReason, const char* errorMessage, const char* errorCode) const { + const std::string& status, const std::string& fullOutputText, bool includeUsage, + const std::optional& incompleteReason, const std::optional& errorMessage, ResponsesErrorCode errorCode) const { writer.StartObject(); writer.String("id"); writer.String(responseId.c_str()); @@ -139,25 +141,25 @@ void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer(std::chrono::system_clock::now().time_since_epoch()).count(); writer.String("completed_at"); writer.Int64(completedAt); } - if (incompleteReason != nullptr) { + if (incompleteReason.has_value()) { writer.String("incomplete_details"); writer.StartObject(); writer.String("reason"); - writer.String(incompleteReason); + writer.String(incompleteReason.value().c_str()); writer.EndObject(); } 
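     // Note: per the Responses API schema assumed throughout this handler, "error" is
     // serialized as null on success and as an object with {code, message} on failure.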
writer.String("error"); - if (errorMessage != nullptr) { + if (errorMessage.has_value()) { writer.StartObject(); writer.String("code"); - writer.String(errorCode != nullptr ? errorCode : "server_error"); + writer.String(responsesErrorCodeToString(errorCode)); writer.String("message"); - writer.String(errorMessage); + writer.String(errorMessage.value().c_str()); writer.EndObject(); } else { writer.Null(); @@ -165,18 +167,17 @@ void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer(request.temperature.value())); - } else { - writer.Double(1.0); } writer.String("text"); writer.StartObject(); @@ -188,11 +189,9 @@ void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer(request.topP.value())); - } else { - writer.Double(1.0); } writer.String("truncation"); writer.String("disabled"); @@ -234,7 +233,7 @@ void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer& writer, const std::string& outputItemId, - const std::string& text, const char* status, bool withContent) { + const std::string& text, const std::string& status) { writer.StartObject(); writer.String("id"); writer.String(outputItemId.c_str()); @@ -294,17 +287,17 @@ void OpenAIChatCompletionsHandler::serializeResponsesOutputItem(Writer& writer, const std::string& text) { +void OpenAIChatCompletionsHandler::serializeOutputTextPart(Writer& writer, const std::string& text) { writer.StartObject(); writer.String("type"); writer.String("output_text"); @@ -319,7 +312,7 @@ void OpenAIChatCompletionsHandler::serializeResponsesPart(Writer& std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const std::vector& parsedOutputs, ov::genai::GenerationFinishReason finishReason) const { const bool isIncomplete = (finishReason == ov::genai::GenerationFinishReason::LENGTH); - const char* responseStatus = isIncomplete ? "incomplete" : "completed"; + const std::string responseStatus = isIncomplete ? 
"incomplete" : "completed"; const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); const auto completedAt = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); const std::string responseId = "resp-" + std::to_string(createdAt); @@ -349,18 +342,17 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const writer.String("model"); writer.String(request.model.c_str()); writer.String("status"); - writer.String(responseStatus); + writer.String(responseStatus.c_str()); writer.String("parallel_tool_calls"); writer.Bool(false); // TODO: previous_response_id not supported writer.String("store"); writer.Bool(true); - writer.String("temperature"); + // TODO: temperature/top_p are only included when explicitly provided in the request if (request.temperature.has_value()) { + writer.String("temperature"); writer.Double(static_cast(request.temperature.value())); - } else { - writer.Double(1.0); } writer.String("text"); writer.StartObject(); @@ -372,11 +364,9 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const writer.EndObject(); serializeResponsesToolChoice(writer); serializeResponsesTools(writer); - writer.String("top_p"); if (request.topP.has_value()) { + writer.String("top_p"); writer.Double(static_cast(request.topP.value())); - } else { - writer.Double(1.0); } writer.String("truncation"); writer.String("disabled"); @@ -423,7 +413,7 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const writer.String("type"); writer.String("function_call"); writer.String("status"); - writer.String(responseStatus); + writer.String(responseStatus.c_str()); writer.String("call_id"); writer.String(toolCall.id.c_str()); writer.String("name"); @@ -446,10 +436,10 @@ std::string OpenAIChatCompletionsHandler::serializeResponsesUnaryResponse(const writer.String("role"); writer.String("assistant"); writer.String("status"); - writer.String(responseStatus); + writer.String(responseStatus.c_str()); writer.String("content"); writer.StartArray(); - serializeResponsesPart(writer, parsedOutput.content); + serializeOutputTextPart(writer, parsedOutput.content); writer.EndArray(); writer.EndObject(); } @@ -714,13 +704,12 @@ absl::Status OpenAIChatCompletionsHandler::parseResponsesInput(std::optionalvalue.GetArray()[i]; if (!obj.IsObject()) return absl::InvalidArgumentError("Tool is not a JSON object"); - rapidjson::Value* functionObj = nullptr; rapidjson::Value* parametersValue = nullptr; - const char* functionNameCStr = nullptr; + std::string functionName; auto functionIt = obj.FindMember("function"); if (functionIt != obj.MemberEnd()) { if (!functionIt->value.IsObject()) { return absl::InvalidArgumentError("Function is not a valid JSON object"); } - functionObj = &functionIt->value; - auto nameIt = functionObj->GetObject().FindMember("name"); - if (nameIt == functionObj->GetObject().MemberEnd() || !nameIt->value.IsString()) { + auto& functionObj = functionIt->value; + auto nameIt = functionObj.GetObject().FindMember("name"); + if (nameIt == functionObj.GetObject().MemberEnd() || !nameIt->value.IsString()) { return absl::InvalidArgumentError("Function object does not contain a valid name field"); } - functionNameCStr = nameIt->value.GetString(); - auto parametersIt = functionObj->GetObject().FindMember("parameters"); - if (parametersIt != functionObj->GetObject().MemberEnd()) { + functionName = nameIt->value.GetString(); + auto parametersIt = 
functionObj.GetObject().FindMember("parameters"); + if (parametersIt != functionObj.GetObject().MemberEnd()) { parametersValue = ¶metersIt->value; } } else { @@ -1043,7 +1065,7 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { if (nameIt == obj.MemberEnd() || !nameIt->value.IsString()) { return absl::InvalidArgumentError("Function object does not contain a valid name field"); } - functionNameCStr = nameIt->value.GetString(); + functionName = nameIt->value.GetString(); auto parametersIt = obj.FindMember("parameters"); if (parametersIt != obj.MemberEnd()) { @@ -1051,7 +1073,6 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { } } - std::string functionName = functionNameCStr; // If tool_choice is set to "auto", we keep all tools // If tool_choice is set to a specific function name, we keep only that tool if (tool_choice != "auto" && tool_choice != "required" && tool_choice != functionName) { @@ -1066,10 +1087,8 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() { if (!parametersValue->IsObject()) { return absl::InvalidArgumentError("Function parameters are not a valid JSON object"); } - // now we want to insert to a mapping of - // tool name -> tool schema representations struct // Dump parameters object to string since this is the schema format expected by GenAI - // Keep the rapidjson::Value object as well to avoid re-parsing in outputParsers + // Keep the rapidjson::Value pointer as well to avoid re-parsing in outputParsers rapidjson::StringBuffer buffer; rapidjson::Writer writer(buffer); parametersValue->Accept(writer); @@ -1647,14 +1666,14 @@ absl::Status OpenAIChatCompletionsHandler::parseCommonPart(std::optionalrequest.logprobschat || this->request.logprobs) { jsonResponse.StartObject("logprobs"); if (endpoint == Endpoint::CHAT_COMPLETIONS) { @@ -1863,11 +1882,9 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect jsonResponse.UsageObject(usage); - // TODO - // id: string; A unique identifier for the chat completion. + // TODO: id: string; A unique identifier for the chat completion. - // TODO - // system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. + // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. // finish response object @@ -1931,11 +1948,9 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco jsonResponse.UsageObject(usage); - // TODO - // id: string; A unique identifier for the chat completion. + // TODO: id: string; A unique identifier for the chat completion. - // TODO - // system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. + // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. // finish response object @@ -1948,21 +1963,29 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD usage.promptTokens = results.perf_metrics.get_num_input_tokens(); usage.completionTokens = results.perf_metrics.get_num_generated_tokens(); if (endpoint == Endpoint::RESPONSES) { + // Usage is already correctly set from perf_metrics above — no need for updateUsage. 
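+            // Sketch of the two branches below: with an output parser the decoded text is
+            // re-encoded to tokens so tool calls/reasoning can be extracted; without one the
+            // decoded text is wrapped directly in a ParsedOutput.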
std::vector<ParsedOutput> parsedOutputs;
         for (const std::string& text : results.texts) {
-            auto result = tokenizer.encode(text);
-            auto& input_ids = result.input_ids;
-            if (input_ids.get_shape().size() != 2)
-                throw std::runtime_error("input_ids should have 2 dimensions");
-            if (input_ids.get_shape()[0] != 1)
-                throw std::runtime_error("input_ids should have 1 batch size");
-            if (input_ids.get_element_type() != ov::element::i64)
-                throw std::runtime_error("input_ids should have i64 element type");
-
-            int64_t* input_ids_data = reinterpret_cast<int64_t*>(input_ids.data());
-            std::vector<int64_t> generatedTokens(input_ids_data, input_ids_data + input_ids.get_shape()[1]);
-            updateUsage(usage, generatedTokens, request.echo);
-            parsedOutputs.push_back(parseOutputIfNeeded(generatedTokens));
+            if (outputParser != nullptr) {
+                // Same workaround as in the chat completions path: re-encode the decoded text
+                // back to tokens so the output parser can run on them.
+                auto result = tokenizer.encode(text);
+                auto& input_ids = result.input_ids;
+                if (input_ids.get_shape().size() != 2)
+                    throw std::runtime_error("input_ids should have 2 dimensions");
+                if (input_ids.get_shape()[0] != 1)
+                    throw std::runtime_error("input_ids should have 1 batch size");
+                if (input_ids.get_element_type() != ov::element::i64)
+                    throw std::runtime_error("input_ids should have i64 element type");
+
+                int64_t* inputIdsData = reinterpret_cast<int64_t*>(input_ids.data());
+                std::vector<int64_t> generatedTokens(inputIdsData, inputIdsData + input_ids.get_shape()[1]);
+                parsedOutputs.push_back(parseOutputIfNeeded(generatedTokens));
+            } else {
+                // Fast path: no output parser, use decoded text directly.
+                ParsedOutput output;
+                output.content = text;
+                parsedOutputs.push_back(std::move(output));
+            }
         }
         return serializeResponsesUnaryResponse(parsedOutputs);
     }
@@ -2000,7 +2023,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
         jsonResponse.FinishReason(finishReason.value_or("unknown"));
         // index: integer; Choice index, only n=1 supported anyway
         jsonResponse.Index(index++);
-        // logprobs: object/null; Log probability information for the choice. TODO
+        // TODO: logprobs: object/null; Log probability information for the choice.
 
         if (endpoint == Endpoint::CHAT_COMPLETIONS) {
             jsonResponse.MessageObject(parsedOutput);
@@ -2029,11 +2052,9 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
 
     jsonResponse.UsageObject(usage);
 
-    // TODO
-    // id: string; A unique identifier for the chat completion.
+    // TODO: id: string; A unique identifier for the chat completion.
 
-    // TODO
-    // system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with.
+    // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with.
     // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism.
// finish response object
@@ -2094,7 +2115,7 @@ std::string OpenAIChatCompletionsHandler::serializeOutputItemAddedEvent(const st
     writer.String("output_index");
     writer.Uint64(outputIndex);
     writer.String("item");
-    serializeResponsesOutputItem(writer, outputItemId, "", "in_progress", false);
+    serializeResponsesOutputItem(writer, outputItemId, "", "in_progress");
     writer.EndObject();
     return buffer.GetString();
 }
@@ -2105,7 +2126,7 @@ std::string OpenAIChatCompletionsHandler::serializeContentPartAddedEvent(const s
     writeEventHeader(writer, "response.content_part.added");
     writeContentLocation(writer, outputItemId, outputIndex);
     writer.String("part");
-    serializeResponsesPart(writer, "");
+    serializeOutputTextPart(writer, "");
     writer.EndObject();
     return buffer.GetString();
 }
@@ -2140,29 +2161,29 @@ std::string OpenAIChatCompletionsHandler::serializeContentPartDoneEvent(const st
     writeEventHeader(writer, "response.content_part.done");
     writeContentLocation(writer, outputItemId, outputIndex);
     writer.String("part");
-    serializeResponsesPart(writer, responsesState.outputText);
+    serializeOutputTextPart(writer, responsesState.outputText);
     writer.EndObject();
     return buffer.GetString();
 }
 
 std::string OpenAIChatCompletionsHandler::serializeOutputItemDoneEvent(const std::string& outputItemId, ov::genai::GenerationFinishReason finishReason, uint64_t outputIndex) {
-    const char* itemStatus = (finishReason == ov::genai::GenerationFinishReason::LENGTH) ? "incomplete" : "completed";
+    const std::string itemStatus = (finishReason == ov::genai::GenerationFinishReason::LENGTH) ? "incomplete" : "completed";
     StringBuffer buffer;
     Writer<StringBuffer> writer(buffer);
     writeEventHeader(writer, "response.output_item.done");
     writer.String("output_index");
     writer.Uint64(outputIndex);
     writer.String("item");
-    serializeResponsesOutputItem(writer, outputItemId, responsesState.outputText, itemStatus, true);
+    serializeResponsesOutputItem(writer, outputItemId, responsesState.outputText, itemStatus);
     writer.EndObject();
     return buffer.GetString();
 }
 
 std::string OpenAIChatCompletionsHandler::serializeResponseCompletedEvent(const std::string& responseId, int64_t createdAt, ov::genai::GenerationFinishReason finishReason) {
     const bool isIncomplete = (finishReason == ov::genai::GenerationFinishReason::LENGTH);
-    const char* responseStatus = isIncomplete ? "incomplete" : "completed";
+    const std::string responseStatus = isIncomplete ? "incomplete" : "completed";
     const char* eventType = isIncomplete ? "response.incomplete" : "response.completed";
-    const char* incompleteReason = isIncomplete ? "max_tokens" : nullptr;
+    std::optional<std::string> incompleteReason = isIncomplete ?
std::optional("max_tokens") : std::nullopt; StringBuffer buffer; Writer writer(buffer); writeEventHeader(writer, eventType); @@ -2172,13 +2193,13 @@ std::string OpenAIChatCompletionsHandler::serializeResponseCompletedEvent(const return buffer.GetString(); } -std::string OpenAIChatCompletionsHandler::serializeResponseFailedEventBody(const std::string& responseId, int64_t createdAt, const std::string& errorMessage, const char* errorCode) { +std::string OpenAIChatCompletionsHandler::serializeResponseFailedEventBody(const std::string& responseId, int64_t createdAt, const std::string& errorMessage, ResponsesErrorCode errorCode) { StringBuffer buffer; Writer writer(buffer); writeEventHeader(writer, "response.failed"); writer.String("response"); serializeResponsesResponseObject(writer, responseId, createdAt, "failed", responsesState.outputText, false, - nullptr, errorMessage.c_str(), errorCode); + std::nullopt, errorMessage, errorCode); writer.EndObject(); return buffer.GetString(); } @@ -2341,7 +2362,7 @@ std::string OpenAIChatCompletionsHandler::serializeFunctionCallArgumentsDoneEven } std::string OpenAIChatCompletionsHandler::serializeFunctionCallOutputItemDoneEvent(const ToolCall& toolCall, ov::genai::GenerationFinishReason finishReason, uint64_t outputIndex) { - const char* itemStatus = (finishReason == ov::genai::GenerationFinishReason::LENGTH) ? "incomplete" : "completed"; + const std::string itemStatus = (finishReason == ov::genai::GenerationFinishReason::LENGTH) ? "incomplete" : "completed"; StringBuffer buffer; Writer writer(buffer); writeEventHeader(writer, "response.output_item.done"); @@ -2354,7 +2375,7 @@ std::string OpenAIChatCompletionsHandler::serializeFunctionCallOutputItemDoneEve writer.String("type"); writer.String("function_call"); writer.String("status"); - writer.String(itemStatus); + writer.String(itemStatus.c_str()); writer.String("call_id"); writer.String(toolCall.id.c_str()); writer.String("name"); @@ -2567,7 +2588,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str // null - natural scenario when the generation has not completed yet // index: integer; Choice index, only n=1 supported anyway choice.AddMember("index", 0, allocator); - // logprobs: object/null; Log probability information for the choice. TODO + // TODO: logprobs: object/null; Log probability information for the choice. choice.AddMember("logprobs", Value(), allocator); if (endpoint == Endpoint::CHAT_COMPLETIONS) { if (outputParser != nullptr) { @@ -2626,11 +2647,9 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str doc.AddMember("usage", Value(), allocator); } - // TODO - // id: string; A unique identifier for the chat completion. Each chunk has the same ID. + // TODO: id: string; A unique identifier for the chat completion. Each chunk has the same ID. - // TODO - // system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. + // TODO: system_fingerprint: string; This fingerprint represents the backend configuration that the model runs with. // Can be used in conjunction with the seed request parameter to understand when backend changes have been made that might impact determinism. 
StringBuffer buffer; @@ -2639,7 +2658,7 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str return buffer.GetString(); } -std::string OpenAIChatCompletionsHandler::serializeResponsesFailedEvent(const std::string& errorMessage, const char* errorCode) { +std::string OpenAIChatCompletionsHandler::serializeResponsesFailedEvent(const std::string& errorMessage, ResponsesErrorCode errorCode) { const auto createdAt = std::chrono::duration_cast(created.time_since_epoch()).count(); const std::string responseId = "resp-" + std::to_string(createdAt); diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 06743e6420..2a340b4ca8 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -51,6 +51,19 @@ enum class Endpoint { TOKENIZE, }; +enum class ResponsesErrorCode { + SERVER_ERROR, + INVALID_PROMPT, +}; + +inline const char* responsesErrorCodeToString(ResponsesErrorCode code) { + switch (code) { + case ResponsesErrorCode::SERVER_ERROR: return "server_error"; + case ResponsesErrorCode::INVALID_PROMPT: return "invalid_prompt"; + default: return "server_error"; + } +} + struct CompletionUsageStatistics { size_t promptTokens = 0; size_t completionTokens = 0; @@ -102,11 +115,11 @@ class OpenAIChatCompletionsHandler { void serializeResponsesToolChoice(Writer& writer) const; void serializeResponsesTools(Writer& writer) const; void serializeResponsesResponseObject(Writer& writer, const std::string& responseId, int64_t createdAt, - const char* status, const std::string& fullOutputText, bool includeUsage, - const char* incompleteReason = nullptr, const char* errorMessage = nullptr, const char* errorCode = nullptr) const; + const std::string& status, const std::string& fullOutputText, bool includeUsage, + const std::optional& incompleteReason = std::nullopt, const std::optional& errorMessage = std::nullopt, ResponsesErrorCode errorCode = ResponsesErrorCode::SERVER_ERROR) const; static void serializeResponsesOutputItem(Writer& writer, const std::string& outputItemId, - const std::string& text, const char* status, bool withContent); - static void serializeResponsesPart(Writer& writer, const std::string& text); + const std::string& text, const std::string& status); + static void serializeOutputTextPart(Writer& writer, const std::string& text); std::string serializeResponsesUnaryResponse(const std::vector& parsedOutputs, ov::genai::GenerationFinishReason finishReason = ov::genai::GenerationFinishReason::STOP) const; @@ -133,7 +146,7 @@ class OpenAIChatCompletionsHandler { std::string serializeReasoningSummaryTextDoneEvent(const std::string& reasoningItemId); std::string serializeReasoningSummaryPartDoneEvent(const std::string& reasoningItemId); std::string serializeReasoningOutputItemDoneEvent(const std::string& reasoningItemId); - std::string serializeResponseFailedEventBody(const std::string& responseId, int64_t createdAt, const std::string& errorMessage, const char* errorCode); + std::string serializeResponseFailedEventBody(const std::string& responseId, int64_t createdAt, const std::string& errorMessage, ResponsesErrorCode errorCode); // Function call streaming event serializers std::string serializeFunctionCallOutputItemAddedEvent(const ToolCall& toolCall, uint64_t outputIndex); @@ -191,6 +204,6 @@ class OpenAIChatCompletionsHandler { std::string serializeStreamingUsageChunk(); std::string serializeStreamingHandshakeChunk(); std::string serializeResponsesStreamingInitEvents(); - std::string 
serializeResponsesFailedEvent(const std::string& errorMessage, const char* errorCode = "server_error"); + std::string serializeResponsesFailedEvent(const std::string& errorMessage, ResponsesErrorCode errorCode = ResponsesErrorCode::SERVER_ERROR); }; } // namespace ovms diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 9e6d8cff88..8c4cab90a8 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -1401,7 +1401,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeResponsesFailedEventWithCustomErro std::optional maxModelLength; ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); - std::string failedEvent = apiHandler->serializeResponsesFailedEvent("Invalid prompt content", "invalid_prompt"); + std::string failedEvent = apiHandler->serializeResponsesFailedEvent("Invalid prompt content", ovms::ResponsesErrorCode::INVALID_PROMPT); ASSERT_NE(failedEvent.find("\"code\":\"invalid_prompt\""), std::string::npos) << failedEvent; ASSERT_NE(failedEvent.find("\"message\":\"Invalid prompt content\""), std::string::npos) << failedEvent; From 2dbb597344ea7ac0c9b975c3d3571b253617ea92 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Mon, 23 Mar 2026 11:25:55 +0100 Subject: [PATCH 21/24] update patch --- .../accuracy/gorilla.patch | 171 ++++++++---------- 1 file changed, 79 insertions(+), 92 deletions(-) diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index 8f79651353..17bbeddfce 100644 --- a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -1,17 +1,16 @@ diff --git a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -index bb625d2..d06483d 100644 +index bb625d2..64c01de 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -@@ -24,6 +24,8 @@ from bfcl_eval.model_handler.api_inference.openai_completion import ( +@@ -24,6 +24,7 @@ from bfcl_eval.model_handler.api_inference.openai_completion import ( OpenAICompletionsHandler, ) from bfcl_eval.model_handler.api_inference.openai_response import OpenAIResponsesHandler -+from bfcl_eval.model_handler.api_inference.ovms_response import OVMSResponsesHandler +from bfcl_eval.model_handler.api_inference.ovms_response_stream import OVMSResponsesStreamHandler from bfcl_eval.model_handler.api_inference.qwen import ( QwenAgentNoThinkHandler, QwenAgentThinkHandler, -@@ -2153,6 +2155,54 @@ third_party_inference_model_map = { +@@ -2153,6 +2154,54 @@ third_party_inference_model_map = { is_fc_model=True, underscore_to_dot=True, ), @@ -45,7 +44,7 @@ index bb625d2..d06483d 100644 + url="http://localhost:8000/v3", + org="ovms", + license="apache-2.0", -+ model_handler=OVMSResponsesHandler, ++ model_handler=OpenAIResponsesHandler, + input_price=None, + output_price=None, + is_fc_model=True, @@ -93,83 +92,85 @@ index 357584f..e45e12c 100644 "store": False, } -diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response.py -new file mode 100644 -index 0000000..f8e19e8 ---- /dev/null -+++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response.py -@@ -0,0 +1,65 @@ 
-+import json -+import os -+ -+from bfcl_eval.model_handler.api_inference.openai_response import OpenAIResponsesHandler -+from overrides import override -+ -+ -+class OVMSResponsesHandler(OpenAIResponsesHandler): -+ """OVMS variant of OpenAIResponsesHandler. -+ -+ Inherits all behavior and only overrides _build_client_kwargs (OVMS defaults), -+ _query_FC and _query_prompting (OVMS-specific kwargs instead of OpenAI reasoning model logic). -+ """ -+ -+ @override -+ def _build_client_kwargs(self): -+ kwargs = {} -+ -+ if api_key := os.getenv("OPENAI_API_KEY", "unused"): -+ kwargs["api_key"] = api_key -+ -+ if base_url := os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v3"): -+ kwargs["base_url"] = base_url -+ -+ if headers_env := os.getenv("OPENAI_DEFAULT_HEADERS"): -+ kwargs["default_headers"] = json.loads(headers_env) -+ -+ return kwargs -+ -+ @override -+ def _query_FC(self, inference_data: dict): -+ message: list[dict] = inference_data["message"] -+ tools = inference_data["tools"] -+ -+ inference_data["inference_input_log"] = { -+ "message": repr(message), -+ "tools": tools, -+ } -+ -+ kwargs = { -+ "input": message, -+ "model": self.model_name, -+ "temperature": self.temperature, +diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py +index 0953fdd..7f6919f 100644 +--- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py ++++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_response.py +@@ -38,10 +38,10 @@ class OpenAIResponsesHandler(BaseHandler): + + kwargs = {} + +- if api_key := os.getenv("OPENAI_API_KEY"): ++ if api_key := os.getenv("OPENAI_API_KEY","unused"): + kwargs["api_key"] = api_key + +- if base_url := os.getenv("OPENAI_BASE_URL"): ++ if base_url := os.getenv("OPENAI_BASE_URL","http://localhost:8000/v3"): + kwargs["base_url"] = base_url + + if headers_env := os.getenv("OPENAI_DEFAULT_HEADERS"): +@@ -99,25 +99,12 @@ class OpenAIResponsesHandler(BaseHandler): + kwargs = { + "input": message, + "model": self.model_name, +- "store": False, +- "include": ["reasoning.encrypted_content"], +- "reasoning": {"summary": "auto"}, + "temperature": self.temperature, + "max_output_tokens": 2048, + "tool_choice": os.getenv("TOOL_CHOICE", "auto"), + "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, -+ } -+ -+ if len(tools) > 0: -+ kwargs["tools"] = tools -+ -+ return self.generate_with_backoff(**kwargs) -+ -+ @override -+ def _query_prompting(self, inference_data: dict): -+ inference_data["inference_input_log"] = {"message": repr(inference_data["message"])} -+ -+ kwargs = { -+ "input": inference_data["message"], -+ "model": self.model_name, -+ "temperature": self.temperature, + } + +- # OpenAI reasoning models don't support temperature parameter +- if ( +- "o3" in self.model_name +- or "o4-mini" in self.model_name +- or "gpt-5" in self.model_name +- ): +- del kwargs["temperature"] +- +- # Non-reasoning models don't support reasoning parameter +- else: +- del kwargs["reasoning"] +- del kwargs["include"] +- + if len(tools) > 0: + kwargs["tools"] = tools + +@@ -218,25 +205,10 @@ class OpenAIResponsesHandler(BaseHandler): + kwargs = { + "input": inference_data["message"], + "model": self.model_name, +- "store": False, +- "include": ["reasoning.encrypted_content"], +- "reasoning": {"summary": "auto"}, + "temperature": self.temperature, 
+ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, -+ } -+ -+ return self.generate_with_backoff(**kwargs) + } + +- # OpenAI reasoning models don't support temperature parameter +- if ( +- "o3" in self.model_name +- or "o4-mini" in self.model_name +- or "gpt-5" in self.model_name +- ): +- del kwargs["temperature"] +- +- # Non-reasoning models don't support reasoning parameter +- else: +- del kwargs["reasoning"] +- del kwargs["include"] +- + return self.generate_with_backoff(**kwargs) + + def _pre_query_processing_prompting(self, test_entry: dict) -> dict: diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py new file mode 100644 -index 0000000..0313452 +index 0000000..bc5ef1e --- /dev/null +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py -@@ -0,0 +1,158 @@ +@@ -0,0 +1,144 @@ +import json +import os + @@ -180,26 +181,12 @@ index 0000000..0313452 +class OVMSResponsesStreamHandler(OpenAIResponsesHandler): + """Streaming variant of OpenAIResponsesHandler. + -+ Inherits all behavior from OpenAIResponsesHandler and only overrides -+ _build_client_kwargs (OVMS defaults), _query_FC, _query_prompting -+ (to add stream=True), and the two _parse_query_response methods -+ (to aggregate SSE events). ++ Inherits all behavior from the (patched) OpenAIResponsesHandler and only overrides ++ _query_FC, _query_prompting (to add stream=True), and the two ++ _parse_query_response methods (to aggregate SSE events). + """ + -+ @override -+ def _build_client_kwargs(self): -+ kwargs = {} -+ -+ if api_key := os.getenv("OPENAI_API_KEY", "unused"): -+ kwargs["api_key"] = api_key -+ -+ if base_url := os.getenv("OPENAI_BASE_URL", "http://localhost:8000/v3"): -+ kwargs["base_url"] = base_url -+ -+ if headers_env := os.getenv("OPENAI_DEFAULT_HEADERS"): -+ kwargs["default_headers"] = json.loads(headers_env) -+ -+ return kwargs ++ @staticmethod + + @staticmethod + def _parse_stream(stream) -> dict: From b2b69fbd6130c1217d659c6fcf81945fc61a2d56 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Mon, 23 Mar 2026 11:47:11 +0100 Subject: [PATCH 22/24] style --- src/llm/apis/openai_completions.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index 2a340b4ca8..2db53f80a9 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -58,9 +58,12 @@ enum class ResponsesErrorCode { inline const char* responsesErrorCodeToString(ResponsesErrorCode code) { switch (code) { - case ResponsesErrorCode::SERVER_ERROR: return "server_error"; - case ResponsesErrorCode::INVALID_PROMPT: return "invalid_prompt"; - default: return "server_error"; + case ResponsesErrorCode::SERVER_ERROR: + return "server_error"; + case ResponsesErrorCode::INVALID_PROMPT: + return "invalid_prompt"; + default: + return "server_error"; } } From 80491434de00cfa5b41add9a65f6e203eeb632e4 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Mon, 23 Mar 2026 16:39:48 +0100 Subject: [PATCH 23/24] fix --- src/test/http_openai_handler_test.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 8c4cab90a8..4cce758b45 100644 --- a/src/test/http_openai_handler_test.cpp +++ 
b/src/test/http_openai_handler_test.cpp @@ -1479,13 +1479,9 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesIncomplet // Should NOT have status "completed" ASSERT_EQ(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized; - // Should contain new spec-aligned fields - ASSERT_NE(serialized.find("\"error\":null"), std::string::npos) << serialized; - ASSERT_NE(serialized.find("\"previous_response_id\":null"), std::string::npos) << serialized; - ASSERT_NE(serialized.find("\"reasoning\":null"), std::string::npos) << serialized; + // Should contain spec-aligned fields ASSERT_NE(serialized.find("\"store\":true"), std::string::npos) << serialized; ASSERT_NE(serialized.find("\"truncation\":\"disabled\""), std::string::npos) << serialized; - ASSERT_NE(serialized.find("\"user\":null"), std::string::npos) << serialized; ASSERT_NE(serialized.find("\"metadata\":{}"), std::string::npos) << serialized; } @@ -1523,13 +1519,9 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesCompleted // Should NOT have incomplete_details ASSERT_EQ(serialized.find("\"incomplete_details\""), std::string::npos) << serialized; - // Should contain new spec-aligned fields - ASSERT_NE(serialized.find("\"error\":null"), std::string::npos) << serialized; - ASSERT_NE(serialized.find("\"previous_response_id\":null"), std::string::npos) << serialized; - ASSERT_NE(serialized.find("\"reasoning\":null"), std::string::npos) << serialized; + // Should contain spec-aligned fields ASSERT_NE(serialized.find("\"store\":true"), std::string::npos) << serialized; ASSERT_NE(serialized.find("\"truncation\":\"disabled\""), std::string::npos) << serialized; - ASSERT_NE(serialized.find("\"user\":null"), std::string::npos) << serialized; ASSERT_NE(serialized.find("\"metadata\":{}"), std::string::npos) << serialized; } From 26a37e17e1c707d6762b9599cd829cf658441db0 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Tue, 24 Mar 2026 14:29:53 +0100 Subject: [PATCH 24/24] fix --- .../accuracy/gorilla.patch | 167 +----------------- src/llm/apis/openai_completions.cpp | 64 +++---- 2 files changed, 35 insertions(+), 196 deletions(-) diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index 17bbeddfce..5e9a8743bc 100644 --- a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -2,15 +2,14 @@ diff --git a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config index bb625d2..64c01de 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -@@ -24,6 +24,7 @@ from bfcl_eval.model_handler.api_inference.openai_completion import ( +@@ -24,6 +24,6 @@ from bfcl_eval.model_handler.api_inference.openai_completion import ( OpenAICompletionsHandler, ) from bfcl_eval.model_handler.api_inference.openai_response import OpenAIResponsesHandler -+from bfcl_eval.model_handler.api_inference.ovms_response_stream import OVMSResponsesStreamHandler from bfcl_eval.model_handler.api_inference.qwen import ( QwenAgentNoThinkHandler, QwenAgentThinkHandler, -@@ -2153,6 +2154,54 @@ third_party_inference_model_map = { +@@ -2153,6 +2154,42 @@ third_party_inference_model_map = { is_fc_model=True, underscore_to_dot=True, ), @@ -49,18 +48,6 @@ index bb625d2..64c01de 100644 + output_price=None, + is_fc_model=True, + underscore_to_dot=True, -+ ), -+ "ovms-model-stream-responses": ModelConfig( -+ 
model_name="ovms-model-stream-responses", -+ display_name="ovms-model-stream-responses", -+ url="http://localhost:8000/v3", -+ org="ovms", -+ license="apache-2.0", -+ model_handler=OVMSResponsesStreamHandler, -+ input_price=None, -+ output_price=None, -+ is_fc_model=True, -+ underscore_to_dot=True, + ), } @@ -165,156 +152,6 @@ index 0953fdd..7f6919f 100644 return self.generate_with_backoff(**kwargs) def _pre_query_processing_prompting(self, test_entry: dict) -> dict: -diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py -new file mode 100644 -index 0000000..bc5ef1e ---- /dev/null -+++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/ovms_response_stream.py -@@ -0,0 +1,144 @@ -+import json -+import os -+ -+from bfcl_eval.model_handler.api_inference.openai_response import OpenAIResponsesHandler -+from overrides import override -+ -+ -+class OVMSResponsesStreamHandler(OpenAIResponsesHandler): -+ """Streaming variant of OpenAIResponsesHandler. -+ -+ Inherits all behavior from the (patched) OpenAIResponsesHandler and only overrides -+ _query_FC, _query_prompting (to add stream=True), and the two -+ _parse_query_response methods (to aggregate SSE events). -+ """ -+ -+ @staticmethod -+ -+ @staticmethod -+ def _parse_stream(stream) -> dict: -+ """Parse responses API SSE stream and return aggregated results.""" -+ text_content = "" -+ reasoning_content = "" -+ tool_calls = {} # keyed by item_id -+ usage = {"input_tokens": 0, "output_tokens": 0} -+ output_items = [] # final output items from response.completed -+ -+ for event in stream: -+ event_type = event.type -+ -+ if event_type == "response.output_text.delta": -+ text_content += event.delta or "" -+ -+ elif event_type == "response.reasoning.delta": -+ reasoning_content += event.delta or "" -+ -+ elif event_type == "response.function_call_arguments.delta": -+ item_id = event.item_id -+ if item_id not in tool_calls: -+ tool_calls[item_id] = {"call_id": "", "name": "", "arguments": ""} -+ tool_calls[item_id]["arguments"] += event.delta or "" -+ -+ elif event_type == "response.output_item.added": -+ item = event.item -+ if hasattr(item, "type") and item.type == "function_call": -+ item_id = item.id -+ tool_calls[item_id] = { -+ "call_id": getattr(item, "call_id", "") or "", -+ "name": getattr(item, "name", "") or "", -+ "arguments": "", -+ } -+ -+ elif event_type in ("response.completed", "response.incomplete"): -+ resp = event.response -+ if hasattr(resp, "usage") and resp.usage: -+ usage["input_tokens"] = resp.usage.input_tokens -+ usage["output_tokens"] = resp.usage.output_tokens -+ if hasattr(resp, "output"): -+ output_items = resp.output -+ -+ return { -+ "text": text_content, -+ "reasoning": reasoning_content, -+ "tool_calls": tool_calls, -+ "usage": usage, -+ "output_items": output_items, -+ } -+ -+ #### FC methods #### -+ -+ @override -+ def _query_FC(self, inference_data: dict): -+ message: list[dict] = inference_data["message"] -+ tools = inference_data["tools"] -+ -+ inference_data["inference_input_log"] = { -+ "message": repr(message), -+ "tools": tools, -+ } -+ -+ kwargs = { -+ "input": message, -+ "model": self.model_name, -+ "temperature": self.temperature, -+ "max_output_tokens": 2048, -+ "tool_choice": os.getenv("TOOL_CHOICE", "auto"), -+ "extra_body": {"chat_template_kwargs": json.loads(os.getenv("CHAT_TEMPLATE_KWARGS", "{}"))}, -+ "stream": True, -+ } 
diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 608a5381fd..641ce9e3cd 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -170,11 +170,11 @@ void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer<StringBuffer>& writer, [...]
 [... hunk body lost in extraction; the surviving tail reads: static_cast<double>(request.temperature.value())); ...]
@@ -189,6 +189,7 @@ void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer<StringBuffer>& writer, [...]
 [... hunk body lost in extraction; the surviving tail reads: static_cast<double>(request.topP.value())); ...]
@@ -242,7 +243,7 @@ void OpenAIChatCompletionsHandler::serializeResponsesResponseObject(Writer<StringBuffer>& writer, [...]
 [... leading lines lost in extraction; the surviving tail reads: ->value.GetArray()) { ...]
         if (!contentItem.IsObject()) {
             return absl::InvalidArgumentError("input content items must be objects");
@@ -987,20 +988,20 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional[...]
 [... three leading context lines lost in extraction ...]
-    auto tool_choice_it = doc.FindMember("tool_choice");
-    std::string tool_choice{"auto"};
-    if (tool_choice_it != doc.MemberEnd() && !tool_choice_it->value.IsNull()) {
-        if (tool_choice_it->value.IsString()) {
-            tool_choice = tool_choice_it->value.GetString();
-            if (tool_choice != "none" && tool_choice != "auto" && tool_choice != "required")
+    auto toolChoiceIt = doc.FindMember("tool_choice");
+    std::string toolChoice{"auto"};
+    if (toolChoiceIt != doc.MemberEnd() && !toolChoiceIt->value.IsNull()) {
+        if (toolChoiceIt->value.IsString()) {
+            toolChoice = toolChoiceIt->value.GetString();
+            if (toolChoice != "none" && toolChoice != "auto" && toolChoice != "required")
                 return absl::InvalidArgumentError("tool_choice should be either 'none' or 'auto' or 'required'");
-        } else if (tool_choice_it->value.IsObject()) {
-            auto toolChoiceObj = tool_choice_it->value.GetObject();
-            auto tool_choice_functionIt = toolChoiceObj.FindMember("function");
-            if (tool_choice_functionIt != toolChoiceObj.MemberEnd() && tool_choice_functionIt->value.IsObject()) {
-                auto nameIt = tool_choice_functionIt->value.GetObject().FindMember("name");
-                if (nameIt != tool_choice_functionIt->value.GetObject().MemberEnd() && nameIt->value.IsString()) {
-                    tool_choice = nameIt->value.GetString();
+        } else if (toolChoiceIt->value.IsObject()) {
+            auto toolChoiceObj = toolChoiceIt->value.GetObject();
+            auto toolChoiceFunctionIt = toolChoiceObj.FindMember("function");
+            if (toolChoiceFunctionIt != toolChoiceObj.MemberEnd() && toolChoiceFunctionIt->value.IsObject()) {
+                auto nameIt = toolChoiceFunctionIt->value.GetObject().FindMember("name");
+                if (nameIt != toolChoiceFunctionIt->value.GetObject().MemberEnd() && nameIt->value.IsString()) {
+                    toolChoice = nameIt->value.GetString();
                 } else {
                     return absl::InvalidArgumentError("tool_choice.function.name is not a valid string");
                 }
@@ -1011,7 +1012,7 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() {
             if (nameIt == toolChoiceObj.MemberEnd() || !nameIt->value.IsString()) {
                 return absl::InvalidArgumentError("tool_choice.name is not a valid string");
             }
-            tool_choice = nameIt->value.GetString();
+            toolChoice = nameIt->value.GetString();
         } else {
             return absl::InvalidArgumentError("tool_choice.function is not a valid JSON object");
         }
@@ -1021,7 +1022,7 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() {
         }
     }
     bool jsonChanged = false;
-    if (tool_choice == "none") {
+    if (toolChoice == "none") {
         // remove tools from the request
         doc.RemoveMember("tools");
         jsonChanged = true;
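The hunks above carry the tool_choice rename through parseTools(); the accepted request shapes are unchanged. As a reminder, sketched in Python with made-up model and tool names, these are the forms the validation lets through ("none"/"auto"/"required" as strings, or an object naming a single function):

    # String form: "auto" keeps all tools, "required" forces a call, "none" strips them.
    request_string = {"model": "ovms-model", "input": "...", "tool_choice": "auto"}

    # Object form: pin one function; the parser reads tool_choice.function.name,
    # falling back to a top-level tool_choice.name (second branch above).
    request_object = {
        "model": "ovms-model",
        "input": "...",
        "tools": [{"type": "function", "name": "get_weather", "parameters": {"type": "object"}}],
        "tool_choice": {"type": "function", "name": "get_weather"},
    }
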
@@ -1073,9 +1074,9 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() {
             }
         }
 
-        // If tool_choice is set to "auto", we keep all tools
-        // If tool_choice is set to a specific function name, we keep only that tool
-        if (tool_choice != "auto" && tool_choice != "required" && tool_choice != functionName) {
+        // If toolChoice is set to "auto", we keep all tools
+        // If toolChoice is set to a specific function name, we keep only that tool
+        if (toolChoice != "auto" && toolChoice != "required" && toolChoice != functionName) {
             it->value.Erase(&obj);
             jsonChanged = true;
             continue;
@@ -1098,10 +1099,10 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() {
             }
         }
     } else {
-        tool_choice = "none";  // If tools are not provided, set tool_choice to "none"
+        toolChoice = "none";  // If tools are not provided, set toolChoice to "none"
     }
 
-    request.toolChoice = tool_choice;
+    request.toolChoice = toolChoice;
     if (jsonChanged) {
         StringBuffer buffer;
         Writer<StringBuffer> writer(buffer);
@@ -1762,9 +1763,10 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect
     if (endpoint == Endpoint::RESPONSES) {
         std::vector<ParsedOutput> parsedOutputs;
         usage.completionTokens = 0;
+        constexpr bool echo = false;  // echo is not supported in Responses API
         ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
         for (const ov::genai::GenerationOutput& generationOutput : generationOutputs) {
-            updateUsage(usage, generationOutput.generated_ids, request.echo);
+            updateUsage(usage, generationOutput.generated_ids, echo);
             parsedOutputs.push_back(parseOutputIfNeeded(generationOutput.generated_ids));
             if (generationOutput.finish_reason == ov::genai::GenerationFinishReason::LENGTH) {
                 responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
@@ -1844,8 +1846,8 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vect
             if (i == 0) {
                 jsonResponse.TextOffsetValue(0);
             } else {
-                std::string text_before_token = tokenizer.decode(std::vector<int64_t>(generationOutput.generated_ids.begin(), generationOutput.generated_ids.begin() + i));
-                jsonResponse.TextOffsetValue(text_before_token.size());
+                std::string textBeforeToken = tokenizer.decode(std::vector<int64_t>(generationOutput.generated_ids.begin(), generationOutput.generated_ids.begin() + i));
+                jsonResponse.TextOffsetValue(textBeforeToken.size());
             }
         }
         jsonResponse.EndArray();
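The logprobs hunk above derives each token's text_offset by decoding the prefix of generated ids before that token. The same idea in a few lines of Python, with a toy decoder standing in for the GenAI tokenizer:

    def text_offsets(decode, token_ids):
        # Offset of token i is the length of the text decoded from tokens [0, i).
        return [0 if i == 0 else len(decode(token_ids[:i])) for i in range(len(token_ids))]

    # Toy decoder: every token id maps to a fixed string.
    vocab = {1: "Hello", 2: ",", 3: " world"}
    decode = lambda ids: "".join(vocab[t] for t in ids)
    assert text_offsets(decode, [1, 2, 3]) == [0, 5, 6]
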
@@ -1967,7 +1969,7 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
     std::vector<ParsedOutput> parsedOutputs;
     for (const std::string& text : results.texts) {
         if (outputParser != nullptr) {
-            // Same workaround as in chat completions, line part
+            // Same workaround as in chat completions
             auto result = tokenizer.encode(text);
             auto& input_ids = result.input_ids;
             if (input_ids.get_shape().size() != 2)
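With the series applied, /v3/responses should answer unary calls end to end. A quick smoke-test sketch against a locally running server; the model name, port, and prompt are placeholders, and output_text is the openai SDK convenience accessor rather than a server field:

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v3", api_key="unused")

    # Unary: one JSON body with status, usage, and output, as the unit tests exercise.
    resp = client.responses.create(model="ovms-model", input="What is OVMS?")
    print(resp.status, resp.usage.total_tokens)
    print(resp.output_text)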