diff --git a/common_settings.bzl b/common_settings.bzl index 3d51351f78..2bc23a3430 100644 --- a/common_settings.bzl +++ b/common_settings.bzl @@ -201,6 +201,7 @@ WINDOWS_COMMON_STATIC_LIBS_COPTS = [ "/wd6240", "/wd6326", "/wd6385", + "/wd6386", "/wd6294", "/guard:cf", "/utf-8", diff --git a/src/BUILD b/src/BUILD index 47510cd54d..8acb80130e 100644 --- a/src/BUILD +++ b/src/BUILD @@ -2510,6 +2510,7 @@ cc_test( "//src/llm:genai_servables", "//src/llm:output_parsers", ":test_llm_output_parser_tests", + ":test_llm_input_processing_tests", "//src/test/mediapipe/calculators:mediapipe_test_calculators", "//src/test/mediapipe/calculators:dependency_free_http_test_calculators", "@mediapipe//mediapipe/calculators/ovms:ovms_calculator", @@ -3035,6 +3036,20 @@ cc_library( local_defines = COMMON_LOCAL_DEFINES, ) +cc_library( + name = "test_llm_input_processing_tests", + linkstatic = 1, + alwayslink = True, + srcs = glob(["test/llm/input_processing/*_test.cpp"]), + deps = [ + "@com_google_googletest//:gtest", + ":test_platform_utils", + "//src/llm:io_processing_input_processors", + ], + copts = COPTS_TESTS, + local_defines = COMMON_LOCAL_DEFINES, +) + ovms_cc_library( name = "capimodule", hdrs = ["capi_frontend/capimodule.hpp"], diff --git a/src/llm/BUILD b/src/llm/BUILD index 397069b9de..a7128989da 100644 --- a/src/llm/BUILD +++ b/src/llm/BUILD @@ -62,6 +62,7 @@ ovms_cc_library( "@stb//:image", ":openai_request", ":output_parsers", + ":generation_config_builders", "//third_party:genai",], visibility = ["//visibility:public"], ) @@ -81,6 +82,7 @@ ovms_cc_library( ":openai_api_handler", ":openai_request", ":output_parsers", + ":io_processing_input_processor", "//third_party:genai",], visibility = ["//visibility:public"], ) @@ -104,13 +106,85 @@ ovms_cc_library( visibility = ["//visibility:public"], ) +ovms_cc_library( + name = "io_processing_input_request", + hdrs = ["io_processing/input_request.hpp", + "io_processing/input_processing_config.hpp", + "io_processing/base_input_processor.hpp"], + srcs = [], + deps = [ + "@mediapipe//mediapipe/framework:calculator_framework", + "//third_party:genai", + ], + visibility = ["//visibility:public"], +) + +ovms_cc_library( + name = "io_processing_input_processor_context", + hdrs = ["io_processing/input_processor_context.hpp"], + srcs = [], + deps = [ + ":io_processing_input_request", + "//third_party:genai", + ] + select({ + "//:disable_python": [], + "//:not_disable_python": [":py_jinja_template_processor"] + PYBIND_DEPS, + }), + visibility = ["//visibility:public"], +) + +ovms_cc_library( + name = "io_processing_input_processors", + hdrs = ["io_processing/input_processors/image_decoding_processor.hpp", + "io_processing/input_processors/chat_template_processor.hpp", + "io_processing/input_processors/raw_prompt_extractor.hpp", + "io_processing/input_processors/text_content_normalization_processor.hpp", + "io_processing/input_processors/tokenization_processor.hpp"], + srcs = ["io_processing/input_processors/image_decoding_processor.cpp", + "io_processing/input_processors/chat_template_processor.cpp", + "io_processing/input_processors/text_content_normalization_processor.cpp", + "io_processing/input_processors/tokenization_processor.cpp"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_framework", + "//src:libovmsprofiler", + "//third_party:curl", + "//src:image_conversion", + "//src/filesystem:libovmsfilesystem", + "@stb//:image", + ":io_processing_input_request", + ":openai_api_handler", + "//third_party:genai", + "//src:libovmslogging", + ] + select({ + "//:disable_python": [], + "//:not_disable_python": [":py_jinja_template_processor"] + PYBIND_DEPS, + }), + visibility = ["//visibility:public"], +) + +ovms_cc_library( + name = "io_processing_input_processor", + hdrs = ["io_processing/input_processor.hpp"], + srcs = ["io_processing/input_processor.cpp"], + deps = [ + "@mediapipe//mediapipe/framework:calculator_framework", + ":io_processing_input_request", + ":io_processing_input_processors", + ":io_processing_input_processor_context", + "//src:libovms_config", + "//third_party:genai", + ], + visibility = ["//visibility:public"], +) + ovms_cc_library( name = "openai_request", hdrs = ["apis/openai_request.hpp"], srcs = [], - deps = ["//third_party:genai", + deps = ["//third_party:genai", "//src/port:rapidjson_document", - ":apis_tool_schema_wrapper",], + ":apis_tool_schema_wrapper", + ":io_processing_input_request",], visibility = ["//visibility:public"], ) @@ -344,6 +418,7 @@ ovms_cc_library( ":openai_completions_api_handler", ":openai_responses_handler", ":generation_config_builders", + ":io_processing_input_processor_context", "//src:httppayload", "//src:libhttpclientconnection", "//src:sse_utils", diff --git a/src/llm/apis/openai_api_handler.cpp b/src/llm/apis/openai_api_handler.cpp index 0e96e9b335..b8b9776199 100644 --- a/src/llm/apis/openai_api_handler.cpp +++ b/src/llm/apis/openai_api_handler.cpp @@ -31,6 +31,7 @@ #include "../../logging.hpp" #include "../../profiler.hpp" #include "../../filesystem/filesystem.hpp" +#include "../io_processing/generation_config_builder.hpp" #pragma warning(push) #pragma warning(disable : 6001 4324 6385 6386) #include "absl/strings/escaping.h" @@ -286,7 +287,7 @@ absl::Status OpenAIApiHandler::parseResponseFormat() { // --- Shared parsing methods --- -absl::Status OpenAIApiHandler::ensureArgumentsInToolCalls(Value& messageObj, bool& jsonChanged) { +absl::Status OpenAIApiHandler::ensureArgumentsInToolCalls(Value& messageObj) { auto& allocator = doc.GetAllocator(); auto toolCallsIt = messageObj.FindMember("tool_calls"); if (toolCallsIt != messageObj.MemberEnd() && toolCallsIt->value.IsArray()) { @@ -307,7 +308,6 @@ absl::Status OpenAIApiHandler::ensureArgumentsInToolCalls(Value& messageObj, boo rapidjson::Value argumentsValue; argumentsValue.SetString("{}", allocator); functionIt->value.GetObject().AddMember(argumentsKey, argumentsValue, allocator); - jsonChanged = true; } } } @@ -348,11 +348,9 @@ absl::Status OpenAIApiHandler::parseTools() { return absl::InvalidArgumentError("tool_choice is not a valid JSON object or string"); } } - bool jsonChanged = false; if (toolChoice == "none") { // remove tools from the request doc.RemoveMember("tools"); - jsonChanged = true; } auto it = doc.FindMember("tools"); if (it != doc.MemberEnd() && !it->value.IsNull()) { @@ -405,7 +403,6 @@ absl::Status OpenAIApiHandler::parseTools() { // If toolChoice is set to a specific function name, we keep only that tool if (toolChoice != "auto" && toolChoice != "required" && toolChoice != functionName) { it->value.Erase(&obj); - jsonChanged = true; continue; } @@ -430,12 +427,6 @@ absl::Status OpenAIApiHandler::parseTools() { } request.toolChoice = toolChoice; - if (jsonChanged) { - StringBuffer buffer; - Writer writer(buffer); - doc.Accept(writer); - request.processedJson = buffer.GetString(); - } return absl::OkStatus(); } @@ -492,18 +483,48 @@ const OpenAIRequest& OpenAIApiHandler::getRequest() const { return request; } -const std::string& OpenAIApiHandler::getProcessedJson() const { - return request.processedJson; -} - -const ImageHistory& OpenAIApiHandler::getImageHistory() const { - return request.imageHistory; -} - ov::genai::ChatHistory& OpenAIApiHandler::getChatHistory() { return request.chatHistory; } +absl::StatusOr OpenAIApiHandler::extractInputRequest(GenerationConfigBuilder& configBuilder) { + configBuilder.parseConfigFromRequest(request); + configBuilder.adjustConfigForDecodingMethod(); + try { + configBuilder.validateStructuredOutputConfig(tokenizer); + } catch (const std::exception& e) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool guided generation will not be applied due to JSON schema validation failure: {}", e.what()); + configBuilder.unsetStructuredOutputConfig(); + } + InputRequest req; + req.generationConfig = configBuilder.getConfig(); + if (endpoint == Endpoint::COMPLETIONS) { + req.input = request.prompt.value_or(""); + } else { + // CHAT_COMPLETIONS and RESPONSES both use ChatHistory. + // Copied (not moved) so the handler retains its own copy for response serialization. + req.input = request.chatHistory; + // Populate tools and chat_template_kwargs on the copied ChatHistory so + // ChatTemplateProcessor can access them via get_tools()/get_extra_context(). + auto& chatHistory = std::get(req.input); + auto toolsResult = parseToolsToJsonContainer(); + if (!toolsResult.ok()) { + return toolsResult.status(); + } + if (toolsResult.value().has_value()) { + chatHistory.set_tools(toolsResult.value().value()); + } + auto kwargsResult = parseChatTemplateKwargsToJsonContainer(); + if (!kwargsResult.ok()) { + return kwargsResult.status(); + } + if (kwargsResult.value().has_value()) { + chatHistory.set_extra_context(kwargsResult.value().value()); + } + } + return req; +} + std::optional OpenAIApiHandler::getMaxTokens() const { return request.maxTokens; } diff --git a/src/llm/apis/openai_api_handler.hpp b/src/llm/apis/openai_api_handler.hpp index c85c066ba0..e9589f0f3a 100644 --- a/src/llm/apis/openai_api_handler.hpp +++ b/src/llm/apis/openai_api_handler.hpp @@ -36,6 +36,7 @@ #include "absl/status/statusor.h" #pragma warning(pop) #include "../io_processing/output_parser.hpp" +#include "../io_processing/input_request.hpp" #include "openai_request.hpp" // Forward declarations for types only used by reference in virtual method signatures @@ -50,6 +51,8 @@ using namespace rapidjson; namespace ovms { +class GenerationConfigBuilder; + ov::genai::JsonContainer rapidJsonValueToJsonContainer(const rapidjson::Value& value); enum class Endpoint { @@ -113,7 +116,7 @@ class OpenAIApiHandler { // Shared parsing helpers absl::Status parseCommonPart(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength); absl::Status parseResponseFormat(); - absl::Status ensureArgumentsInToolCalls(Value& messageObj, bool& jsonChanged); + absl::Status ensureArgumentsInToolCalls(Value& messageObj); ParsedOutput parseOutputIfNeeded(const std::vector& generatedIds); // Shared VLM workaround: encode text to tokens using tokenizer, validates shape @@ -156,8 +159,6 @@ class OpenAIApiHandler { std::optional getPrompt() const; std::optional getNumReturnSequences() const; StreamOptions getStreamOptions() const; - const std::string& getProcessedJson() const; - const ImageHistory& getImageHistory() const; ov::genai::ChatHistory& getChatHistory(); std::optional getMaxTokens() const; std::optional getResponseFormat() const; @@ -166,6 +167,10 @@ class OpenAIApiHandler { std::string getModel() const; std::string getToolChoice() const; const std::shared_ptr& getOutputParser() const; + // Builds a complete InputRequest: runs the full generation config pipeline + // (parse → adjust → validate) on the provided builder using this handler's + // request and tokenizer, then populates input from the parsed request. + absl::StatusOr extractInputRequest(GenerationConfigBuilder& configBuilder); // Verbose response configuration void enableVerboseResponse(const std::string& promptAfterTemplate) { diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index adea6f075f..2ac285e61e 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -163,7 +163,6 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optionalvalue.GetArray().Size() == 0) return absl::InvalidArgumentError("Messages array cannot be empty"); - bool jsonChanged = false; for (size_t i = 0; i < it->value.GetArray().Size(); i++) { auto& obj = it->value.GetArray()[i]; if (!obj.IsObject()) @@ -196,57 +195,38 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optionalvalue.IsArray()) { - // Adjust content field format when it is passed as an array of objects (typically with images) + // Validate the content array and check whether it contains images. if (member->value.GetArray().Size() == 0) { return absl::InvalidArgumentError("Invalid message structure - content array is empty"); } - jsonChanged = true; - std::string combinedText; - for (auto& v : member->value.GetArray()) { + for (const auto& v : member->value.GetArray()) { if (!v.IsObject()) { return absl::InvalidArgumentError("Invalid message structure - content array should contain objects"); } - auto entry = v.GetObject(); + const auto entry = v.GetObject(); if (!entry.HasMember("type") || !entry["type"].IsString()) { return absl::InvalidArgumentError("Invalid message structure - content object type missing"); } - auto entryType = entry["type"].GetString(); - if (entryType == std::string("text")) { + const std::string entryType = entry["type"].GetString(); + if (entryType == "text") { if (!entry.HasMember("text") || !entry["text"].IsString()) { return absl::InvalidArgumentError("Invalid message structure - content text missing"); } - if (!combinedText.empty()) { - combinedText += "\n"; - } - combinedText.append(entry["text"].GetString(), entry["text"].GetStringLength()); - continue; - } else if (entryType == std::string("image_url")) { + } else if (entryType == "image_url") { if (!entry.HasMember("image_url") || !entry["image_url"].IsObject()) { return absl::InvalidArgumentError("Invalid message structure - content image_url missing"); } - auto imageUrl = entry["image_url"].GetObject(); + const auto imageUrl = entry["image_url"].GetObject(); if (!imageUrl.HasMember("url") || !imageUrl["url"].IsString()) { return absl::InvalidArgumentError("Invalid message structure - image_url does not have url field"); } - std::string url = imageUrl["url"].GetString(); - auto tensorResult = loadImage(url, allowedLocalMediaPath, allowedMediaDomains); - if (!tensorResult.ok()) { - return tensorResult.status(); - } - request.imageHistory.push_back({i, tensorResult.value()}); } else { return absl::InvalidArgumentError("Unsupported content type"); } } - // Flatten all text parts (joined with newlines) into the "content" field. - // Images are stored separately in request.imageHistory. - Value contentText(rapidjson::kStringType); - contentText.SetString(combinedText.c_str(), combinedText.length(), doc.GetAllocator()); - member->value = contentText; - // Add new field to the last message in history if content is text - if (member->value.IsString()) { - request.chatHistory.last()[member->name.GetString()] = member->value.GetString(); - } + // Preserve content array for downstream processors + // (ImageDecodingProcessor for VLM, TextContentNormalizationProcessor for LM). + request.chatHistory.last()[memberName] = rapidJsonValueToJsonContainer(member->value); } } auto lastMessage = request.chatHistory.last(); @@ -256,21 +236,13 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional writer(buffer); - doc.Accept(writer); - request.processedJson = buffer.GetString(); - } SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Parsed messages successfully"); return absl::OkStatus(); } diff --git a/src/llm/apis/openai_request.hpp b/src/llm/apis/openai_request.hpp index 24327be44f..5b433a7a0b 100644 --- a/src/llm/apis/openai_request.hpp +++ b/src/llm/apis/openai_request.hpp @@ -33,16 +33,12 @@ #include "tool_schema_wrapper.hpp" namespace ovms { -using ImageHistory = std::vector>; - struct StreamOptions { bool includeUsage = false; }; // Class that maps OpenAI request content. struct OpenAIRequest { ov::genai::ChatHistory chatHistory; - std::string processedJson; - ImageHistory imageHistory; std::optional prompt{std::nullopt}; bool stream{false}; StreamOptions streamOptions; diff --git a/src/llm/apis/openai_responses.cpp b/src/llm/apis/openai_responses.cpp index ab9487a68a..228e84154a 100644 --- a/src/llm/apis/openai_responses.cpp +++ b/src/llm/apis/openai_responses.cpp @@ -61,7 +61,7 @@ static std::string joinServerSideEvents(const std::vector& events) // nested format ({type:"function", function:{name, description, parameters, ...}}) in place // on the request document. The chat template (e.g. gpt-oss) and the chat/completions tools // schema both expect the nested shape; doing this once up front lets every downstream -// consumer (chat history path, processedJson builder for Python Jinja, parseToolsToJsonContainer) +// consumer (chat history path, parseToolsToJsonContainer) // share the same representation. Tools already in nested form, or non-function tools, are // left untouched. static void convertResponsesToolsInPlace(rapidjson::Value& toolsArray, rapidjson::Document::AllocatorType& alloc) { @@ -134,38 +134,6 @@ static std::string extractReasoningText(const rapidjson::Value::ConstObject& ite return ""; } -// Extract a flat text string from a Responses API content field which may be -// either a string or an array of {type,text} objects. When multiple text items -// are present, the last one wins, matching ChatHistorySink::extractContent so -// the Python/Jinja processedJson path and the C++ chatHistory path produce the -// same prompt. -#if (PYTHON_DISABLE == 0) -static std::string extractTextContent(const rapidjson::Value& contentVal) { - if (contentVal.IsString()) { - return contentVal.GetString(); - } - if (!contentVal.IsArray()) { - return ""; - } - std::string result; - for (const auto& ci : contentVal.GetArray()) { - if (!ci.IsObject()) - continue; - auto ctTypeIt = ci.GetObject().FindMember("type"); - if (ctTypeIt == ci.GetObject().MemberEnd() || !ctTypeIt->value.IsString()) - continue; - const std::string ctType = ctTypeIt->value.GetString(); - if (ctType == "input_text" || ctType == "output_text") { - auto textIt = ci.GetObject().FindMember("text"); - if (textIt != ci.GetObject().MemberEnd() && textIt->value.IsString()) { - result = textIt->value.GetString(); - } - } - } - return result; -} -#endif - // Read the three string fields (id, name, arguments) out of a function_call item. // // The Responses API function_call item carries both "id" (the *item* id, e.g. @@ -217,7 +185,7 @@ static absl::Status validateFunctionCallItem(const rapidjson::Value& item) { } // Build a chat/completions tool_calls[] array into outArr using the given -// allocator. Shared by ChatHistorySink and ProcessedJsonSink so the two paths +// allocator. Shared by ChatHistorySink so the two paths // cannot drift on id/call_id handling or field layout. static void buildToolCallsArray(const std::vector& toolCalls, rapidjson::Value& outArr, rapidjson::Document::AllocatorType& alloc) { @@ -368,10 +336,7 @@ class ResponsesInputBuilder { // as a standalone assistant turn.) // // Flushing BEFORE extractContent is intentional: it makes - // chatHistory.size() equal the index this item's message will land at, - // so the ChatHistorySink can record image-history turn indices directly - // from chatHistory.size() instead of from the Responses input-array - // index (which drifts when items are buffered/merged). + // chatHistory.size() equal the index this item's message will land at. if (role != "assistant") { flushPendingFunctionCalls(""); } @@ -425,23 +390,21 @@ class ResponsesInputBuilder { // Sink that appends to ov::genai::ChatHistory (used when Python is disabled // or as the fallback C++ chat-history path). Owns a scratch rapidjson document -// whose allocator backs the tool_calls Values until they are deep-copied into -// a JsonContainer. +// whose allocator backs tool_calls Values and pending content arrays until they +// are deep-copied into a JsonContainer. class ChatHistorySink { public: - ChatHistorySink(ov::genai::ChatHistory& chatHistory, ImageHistory& imageHistory, - const std::optional& allowedLocalMediaPath, - const std::optional>& allowedMediaDomains) : - chatHistory(chatHistory), - imageHistory(imageHistory), - allowedLocalMediaPath(allowedLocalMediaPath), - allowedMediaDomains(allowedMediaDomains) { + explicit ChatHistorySink(ov::genai::ChatHistory& chatHistory) : + chatHistory(chatHistory) { scratchDoc.SetObject(); + pendingContentArray.SetArray(); } absl::Status extractContent(const rapidjson::Value::ConstObject& itemObj, rapidjson::SizeType /*index*/, std::string& outText) { outText.clear(); + hasPendingContent = false; + pendingContentArray.SetArray(); auto contentIt = itemObj.FindMember("content"); if (contentIt == itemObj.MemberEnd()) return absl::InvalidArgumentError("input item is missing required content field"); @@ -453,6 +416,9 @@ class ChatHistorySink { return absl::InvalidArgumentError("input item content must be a string or array"); if (contentIt->value.Empty()) return absl::InvalidArgumentError("input item content array must not be empty"); + // Build a content array in chat/completions format for downstream processors: + // input_text/output_text → {"type":"text","text":"..."} + // input_image → {"type":"image_url","image_url":{"url":"..."}} for (const auto& contentItem : contentIt->value.GetArray()) { if (!contentItem.IsObject()) return absl::InvalidArgumentError("input content items must be objects"); @@ -465,16 +431,21 @@ class ChatHistorySink { auto textIt = contentObj.FindMember("text"); if (textIt == contentObj.MemberEnd() || !textIt->value.IsString()) return absl::InvalidArgumentError(absl::StrCat(type, " requires a valid text field")); - // Last text-bearing item wins, matching pre-refactor behaviour. - outText = textIt->value.GetString(); + rapidjson::Value textEntry(rapidjson::kObjectType); + textEntry.AddMember("type", rapidjson::Value("text", scratchDoc.GetAllocator()), scratchDoc.GetAllocator()); + textEntry.AddMember("text", rapidjson::Value(textIt->value.GetString(), scratchDoc.GetAllocator()), scratchDoc.GetAllocator()); + pendingContentArray.PushBack(textEntry, scratchDoc.GetAllocator()); } else if (type == "input_image") { - auto status = appendInputImage(contentObj); + auto status = buildImageEntry(contentObj); if (!status.ok()) return status; } else { return absl::InvalidArgumentError(absl::StrCat("unsupported input content item type: ", type)); } } + // Preserve content array for downstream processors + // (ImageDecodingProcessor for VLM, TextContentNormalizationProcessor for LM). + hasPendingContent = true; return absl::OkStatus(); } @@ -489,7 +460,14 @@ class ChatHistorySink { void emitMessage(const std::string& role, const std::string& contentText, const std::string& reasoning) { chatHistory.push_back({}); chatHistory.last()["role"] = role; - chatHistory.last()["content"] = contentText; + if (hasPendingContent) { + // Preserve the content array for ImageDecodingProcessor (VLM) or + // TextContentNormalizationProcessor (LM). + chatHistory.last()["content"] = rapidJsonValueToJsonContainer(pendingContentArray); + hasPendingContent = false; + } else { + chatHistory.last()["content"] = contentText; + } if (!reasoning.empty()) chatHistory.last()["reasoning_content"] = reasoning; } @@ -498,7 +476,12 @@ class ChatHistorySink { const std::vector& toolCalls) { chatHistory.push_back({}); chatHistory.last()["role"] = "assistant"; - chatHistory.last()["content"] = contentText; + if (hasPendingContent) { + chatHistory.last()["content"] = rapidJsonValueToJsonContainer(pendingContentArray); + hasPendingContent = false; + } else { + chatHistory.last()["content"] = contentText; + } if (!reasoning.empty()) chatHistory.last()["reasoning_content"] = reasoning; auto& alloc = scratchDoc.GetAllocator(); @@ -524,12 +507,10 @@ class ChatHistorySink { } private: - // Record (chatTurnIndex, tensor) immediately. This is correct because - // onRoleItem() flushes any buffered standalone messages BEFORE calling - // extractContent(), so at this point chatHistory.size() is the index that - // the upcoming emitMessage()/emitAssistantWithToolCalls() will push the - // image-bearing message into. - absl::Status appendInputImage(const rapidjson::Value::ConstObject& contentObj) { + // Append a {"type":"image_url","image_url":{"url":"..."}} entry to + // pendingContentArray. Actual image decoding is deferred to + // ImageDecodingProcessor; the URL is preserved so it can locate the image later. + absl::Status buildImageEntry(const rapidjson::Value::ConstObject& contentObj) { auto imageUrlIt = contentObj.FindMember("image_url"); if (imageUrlIt == contentObj.MemberEnd()) return absl::InvalidArgumentError("input_image requires image_url field"); @@ -547,89 +528,21 @@ class ChatHistorySink { return absl::InvalidArgumentError("input_image.image_url must be a string or object"); } - auto tensorResult = loadImage(imageUrl, allowedLocalMediaPath, allowedMediaDomains); - if (!tensorResult.ok()) - return tensorResult.status(); - imageHistory.push_back({chatHistory.size(), tensorResult.value()}); + rapidjson::Value imageUrlObj(rapidjson::kObjectType); + imageUrlObj.AddMember("url", rapidjson::Value(imageUrl.c_str(), scratchDoc.GetAllocator()), scratchDoc.GetAllocator()); + rapidjson::Value entry(rapidjson::kObjectType); + entry.AddMember("type", rapidjson::Value("image_url", scratchDoc.GetAllocator()), scratchDoc.GetAllocator()); + entry.AddMember("image_url", imageUrlObj, scratchDoc.GetAllocator()); + pendingContentArray.PushBack(entry, scratchDoc.GetAllocator()); return absl::OkStatus(); } ov::genai::ChatHistory& chatHistory; - ImageHistory& imageHistory; - const std::optional& allowedLocalMediaPath; - const std::optional>& allowedMediaDomains; rapidjson::Document scratchDoc; + rapidjson::Value pendingContentArray{rapidjson::kArrayType}; + bool hasPendingContent = false; }; -#if (PYTHON_DISABLE == 0) -// Sink that appends to a rapidjson messages array, used to feed the Python -// Jinja chat template path. Image content items are silently dropped (the -// Python path receives only text). -class ProcessedJsonSink { -public: - ProcessedJsonSink(rapidjson::Value& messagesArray, rapidjson::Document::AllocatorType& alloc) : - messagesArray(messagesArray), - alloc(alloc) {} - - absl::Status extractContent(const rapidjson::Value::ConstObject& itemObj, - rapidjson::SizeType /*index*/, std::string& outText) { - auto contentIt = itemObj.FindMember("content"); - outText = (contentIt != itemObj.MemberEnd()) ? extractTextContent(contentIt->value) : ""; - return absl::OkStatus(); - } - - void emitToolMessage(const std::string& callId, const std::string& output) { - rapidjson::Value msgObj(rapidjson::kObjectType); - msgObj.AddMember("role", rapidjson::Value("tool", alloc), alloc); - if (!callId.empty()) - msgObj.AddMember("tool_call_id", rapidjson::Value(callId.c_str(), alloc), alloc); - msgObj.AddMember("content", rapidjson::Value(output.c_str(), alloc), alloc); - messagesArray.PushBack(msgObj, alloc); - } - - void emitMessage(const std::string& role, const std::string& contentText, const std::string& reasoning) { - rapidjson::Value msgObj(rapidjson::kObjectType); - msgObj.AddMember("role", rapidjson::Value(role.c_str(), alloc), alloc); - msgObj.AddMember("content", rapidjson::Value(contentText.c_str(), alloc), alloc); - if (!reasoning.empty()) - msgObj.AddMember("reasoning_content", rapidjson::Value(reasoning.c_str(), alloc), alloc); - messagesArray.PushBack(msgObj, alloc); - } - - // Emit an assistant turn that carries only reasoning_content (no - // tool_calls). See ChatHistorySink::emitStandaloneReasoning for rationale. - void emitStandaloneReasoning(const std::string& reasoning) { - rapidjson::Value msgObj(rapidjson::kObjectType); - msgObj.AddMember("role", rapidjson::Value("assistant", alloc), alloc); - msgObj.AddMember("content", rapidjson::Value("", alloc), alloc); - msgObj.AddMember("reasoning_content", rapidjson::Value(reasoning.c_str(), alloc), alloc); - messagesArray.PushBack(msgObj, alloc); - } - - void emitAssistantWithToolCalls(const std::string& contentText, const std::string& reasoning, - const std::vector& toolCalls) { - rapidjson::Value msgObj(rapidjson::kObjectType); - msgObj.AddMember("role", rapidjson::Value("assistant", alloc), alloc); - msgObj.AddMember("content", rapidjson::Value(contentText.c_str(), alloc), alloc); - if (!reasoning.empty()) - msgObj.AddMember("reasoning_content", rapidjson::Value(reasoning.c_str(), alloc), alloc); - rapidjson::Value toolCallsArray(rapidjson::kArrayType); - buildToolCallsArray(toolCalls, toolCallsArray, alloc); - msgObj.AddMember("tool_calls", toolCallsArray, alloc); - messagesArray.PushBack(msgObj, alloc); - } - - absl::Status onMissingRole(const rapidjson::Value::ConstObject&) { - // Silently skip unknown items without a role in the processed JSON path. - return absl::OkStatus(); - } - -private: - rapidjson::Value& messagesArray; - rapidjson::Document::AllocatorType& alloc; -}; -#endif // PYTHON_DISABLE == 0 - // --- Request parsing --- absl::Status OpenAIResponsesHandler::parseRequest(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength, @@ -660,8 +573,7 @@ absl::Status OpenAIResponsesHandler::parseInput(std::optional allow if (inputIt->value.GetArray().Size() == 0) { return absl::InvalidArgumentError("input array must not be empty"); } - ChatHistorySink sink(request.chatHistory, request.imageHistory, - allowedLocalMediaPath, allowedMediaDomains); + ChatHistorySink sink(request.chatHistory); ResponsesInputBuilder builder(sink); auto status = builder.build(inputIt->value); if (!status.ok()) { @@ -688,8 +600,7 @@ absl::Status OpenAIResponsesHandler::parseResponsesPart(std::optional } // Convert tools array (Responses-flat -> chat/completions-nested) once, in place, - // before any consumer reads it. parseInput, parseToolsToJsonContainer and the - // processedJson builder all rely on the nested shape. + // before any consumer reads it. parseInput and parseToolsToJsonContainer both rely on the nested shape. auto toolsIt = doc.FindMember("tools"); if (toolsIt != doc.MemberEnd() && toolsIt->value.IsArray()) { convertResponsesToolsInPlace(toolsIt->value, doc.GetAllocator()); @@ -749,64 +660,6 @@ absl::Status OpenAIResponsesHandler::parseResponsesPart(std::optional return toolsStatus; } -#if (PYTHON_DISABLE == 0) - // Build processedJson with a "messages" array in chat/completions format so that - // the Python Jinja template path can consume Responses API input without a separate code path. - // Handles reasoning, function_call (merged into assistant tool_calls), and - // function_call_output (converted to role:tool messages). - // - // Built after parseTools() so any tool filtering (e.g. tool_choice removing - // unselected tools) is reflected here, and so parseTools()'s own write to - // request.processedJson (Responses-shaped doc with "input") does not - // clobber the chat/completions-shaped JSON the Python Jinja path expects. - { - Document processedDoc; - processedDoc.SetObject(); - auto& alloc = processedDoc.GetAllocator(); - - Value messagesArray(kArrayType); - - auto inputArrIt = doc.FindMember("input"); - if (inputArrIt != doc.MemberEnd() && inputArrIt->value.IsArray()) { - ProcessedJsonSink sink(messagesArray, alloc); - ResponsesInputBuilder builder(sink); - auto processedStatus = builder.build(inputArrIt->value); - if (!processedStatus.ok()) { - return processedStatus; - } - } else if (inputArrIt != doc.MemberEnd() && inputArrIt->value.IsString()) { - // String input: emit a single user message so the Python Jinja path - // sees the same content the C++ chatHistory path does. - Value msgObj(kObjectType); - msgObj.AddMember("role", Value("user", alloc), alloc); - msgObj.AddMember("content", Value(inputArrIt->value.GetString(), alloc), alloc); - messagesArray.PushBack(msgObj, alloc); - } - - processedDoc.AddMember("messages", messagesArray, alloc); - - // Tools were already normalised to chat/completions nested format by - // convertResponsesToolsInPlace earlier in parseResponsesPart — just copy verbatim. - auto processedToolsIt = doc.FindMember("tools"); - if (processedToolsIt != doc.MemberEnd() && !processedToolsIt->value.IsNull()) { - Value toolsCopy(processedToolsIt->value, alloc); - processedDoc.AddMember("tools", toolsCopy, alloc); - } - - // Copy chat_template_kwargs from original doc if present - auto kwargsIt = doc.FindMember("chat_template_kwargs"); - if (kwargsIt != doc.MemberEnd() && !kwargsIt->value.IsNull()) { - Value kwargsCopy(kwargsIt->value, alloc); - processedDoc.AddMember("chat_template_kwargs", kwargsCopy, alloc); - } - - StringBuffer buffer; - Writer writer(buffer); - processedDoc.Accept(writer); - request.processedJson = buffer.GetString(); - } -#endif - // max_output_tokens: uint; optional // OpenAI Responses API uses this field for output token limit. it = doc.FindMember("max_output_tokens"); diff --git a/src/llm/io_processing/base_input_processor.hpp b/src/llm/io_processing/base_input_processor.hpp new file mode 100644 index 0000000000..d9c5a0772a --- /dev/null +++ b/src/llm/io_processing/base_input_processor.hpp @@ -0,0 +1,40 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 6001 4324 6385 6386) +#endif +#include "absl/status/status.h" +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#include "input_request.hpp" + +namespace ovms { + +// Abstract base for a single step in the input processing chain. +class BaseInputProcessor { +public: + virtual ~BaseInputProcessor() = default; + + // Transform req in-place. A non-OK status aborts the chain. + virtual absl::Status process(InputRequest& req) = 0; +}; + +} // namespace ovms diff --git a/src/llm/io_processing/input_processing_config.hpp b/src/llm/io_processing/input_processing_config.hpp new file mode 100644 index 0000000000..d92d0d7a7d --- /dev/null +++ b/src/llm/io_processing/input_processing_config.hpp @@ -0,0 +1,32 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +namespace ovms { + +// Deployment-level configuration for InputProcessor, populated once at servable init. +struct InputProcessingConfig { + // True for VLM servables. Enables ImageDecodingProcessor; TokenizationProcessor + // still runs (to populate inputIds for usage statistics and max-length checks) + // but inputIds is not passed to the VLM pipeline for inference. + bool isVLM = false; + // True when the GenAI built-in tokenizer.apply_chat_template() should be used + // even on Python-enabled builds (i.e. ChatTemplateMode::MINJA). + // False (default) uses PyJinjaTemplateProcessor when PYTHON_DISABLE==0. + bool useMinja = false; +}; + +} // namespace ovms diff --git a/src/llm/io_processing/input_processor.cpp b/src/llm/io_processing/input_processor.cpp new file mode 100644 index 0000000000..3c26b82e6a --- /dev/null +++ b/src/llm/io_processing/input_processor.cpp @@ -0,0 +1,87 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include "input_processor.hpp" + +#include +#include + +#include "../../config.hpp" +#include "../../logging.hpp" +#include "input_processors/chat_template_processor.hpp" +#include "input_processors/image_decoding_processor.hpp" +#include "input_processors/raw_prompt_extractor.hpp" +#include "input_processors/text_content_normalization_processor.hpp" +#include "input_processors/tokenization_processor.hpp" + +namespace ovms { + +InputProcessor::InputProcessor(InputProcessorContext& context, + const InputRequest& req) { + const bool isChatPath = std::holds_alternative(req.input); + // Chat template already adds special tokens; completions path needs them added by the tokenizer. + const bool addSpecialTokens = !isChatPath; + + if (context.config.isVLM && isChatPath) { + const auto& settings = Config::instance().getServerSettings(); + processors.emplace_back(std::make_unique( + settings.allowedLocalMediaPath, + settings.allowedMediaDomains)); + } + + if (!context.config.isVLM && isChatPath) { + processors.emplace_back(std::make_unique()); + } + + if (isChatPath) { +#if (PYTHON_DISABLE == 0) + // Select the path at construction time. If !useMinja but templateProcessor is null + // (shouldn't happen on a properly initialized servable), fall back to the native path. + if (!context.config.useMinja && context.templateProcessor != nullptr) { + processors.emplace_back(std::make_unique( + context.tokenizer, *context.templateProcessor)); + } else { + processors.emplace_back(std::make_unique(context.tokenizer)); + } +#else + processors.emplace_back(std::make_unique(context.tokenizer)); +#endif + } else { + processors.emplace_back(std::make_unique()); + } + + // TokenizationProcessor runs for all paths: + // - LM: provides inputIds for inference. + // - VLM: provides inputIds for usage statistics and max-length checks + // (the VLM pipeline tokenizes internally; inputIds is not passed to it). + if (!context.config.isVLM || isChatPath) { + processors.emplace_back(std::make_unique( + context.tokenizer, addSpecialTokens)); + } +} + +absl::Status InputProcessor::process(InputRequest& req) { + for (auto& processor : processors) { + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "InputProcessor: executing {}", typeid(*processor).name()); + auto status = processor->process(req); + if (!status.ok()) { + return status; + } + } + return absl::OkStatus(); +} + +} // namespace ovms diff --git a/src/llm/io_processing/input_processor.hpp b/src/llm/io_processing/input_processor.hpp new file mode 100644 index 0000000000..671dd6ce2e --- /dev/null +++ b/src/llm/io_processing/input_processor.hpp @@ -0,0 +1,45 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include +#include + +#include "absl/status/status.h" + +#include "base_input_processor.hpp" +#include "input_processing_config.hpp" +#include "input_processor_context.hpp" +#include "input_request.hpp" + +namespace ovms { + +// Orchestrates the input processing chain. +// The constructor selects concrete processors based on InputProcessorContext +// and the active InputPayload variant. The chain composition is an implementation detail. +class InputProcessor { +public: + InputProcessor(InputProcessorContext& context, + const InputRequest& req); + + // Execute the chain in order. Returns the first non-OK status encountered. + absl::Status process(InputRequest& req); + +private: + std::vector> processors; +}; + +} // namespace ovms diff --git a/src/llm/io_processing/input_processor_context.hpp b/src/llm/io_processing/input_processor_context.hpp new file mode 100644 index 0000000000..6d2a0c62a2 --- /dev/null +++ b/src/llm/io_processing/input_processor_context.hpp @@ -0,0 +1,39 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include + +#include + +#include "input_processing_config.hpp" +#if (PYTHON_DISABLE == 0) +#include "../py_jinja_template_processor.hpp" +#endif + +namespace ovms { + +// Holds the per-deployment resources needed by InputProcessor. +// Created once during servable initialization; reused across requests. +struct InputProcessorContext { + InputProcessingConfig config; + ov::genai::Tokenizer tokenizer; +#if (PYTHON_DISABLE == 0) + PyJinjaTemplateProcessor* templateProcessor = nullptr; +#endif +}; + +} // namespace ovms diff --git a/src/llm/io_processing/input_processors/chat_template_processor.cpp b/src/llm/io_processing/input_processors/chat_template_processor.cpp new file mode 100644 index 0000000000..16639f1e09 --- /dev/null +++ b/src/llm/io_processing/input_processors/chat_template_processor.cpp @@ -0,0 +1,99 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include "chat_template_processor.hpp" + +#include +#include +#include + +#include "../../../logging.hpp" + +namespace ovms { + +#if (PYTHON_DISABLE == 0) +ChatTemplateProcessor::ChatTemplateProcessor(ov::genai::Tokenizer& tokenizer, + PyJinjaTemplateProcessor& templateProcessor) : + tokenizer(&tokenizer), + templateProcessor(templateProcessor) {} + +ChatTemplateProcessor::ChatTemplateProcessor(ov::genai::Tokenizer& tokenizer) : + tokenizer(&tokenizer), + templateProcessor(std::nullopt) {} + +std::string ChatTemplateProcessor::serializeForPyJinja(const ov::genai::ChatHistory& chatHistory) { + // Build the minimal JSON object that PyJinjaTemplateProcessor::applyChatTemplate expects: + // {"messages":[...], "tools":[...], "chat_template_kwargs":{...}} + std::string json = "{\"messages\":" + chatHistory.get_messages().to_json_string(); + const auto& tools = chatHistory.get_tools(); + if (!tools.empty()) { + json += ",\"tools\":" + tools.to_json_string(); + } + const auto& kwargs = chatHistory.get_extra_context(); + if (!kwargs.empty()) { + json += ",\"chat_template_kwargs\":" + kwargs.to_json_string(); + } + json += "}"; + return json; +} + +#else +ChatTemplateProcessor::ChatTemplateProcessor(ov::genai::Tokenizer& tokenizer) : + tokenizer(&tokenizer) {} +#endif + +absl::Status ChatTemplateProcessor::process(InputRequest& req) { + const ov::genai::ChatHistory& chatHistory = std::get(req.input); + +#if (PYTHON_DISABLE == 0) + if (templateProcessor.has_value()) { + const std::string jsonBody = serializeForPyJinja(chatHistory); + std::string promptText; + const bool success = PyJinjaTemplateProcessor::applyChatTemplate( + templateProcessor.value().get(), jsonBody, promptText); + if (!success) { + return absl::Status(absl::StatusCode::kInvalidArgument, promptText); + } + req.promptText = std::move(promptText); + } else { +#endif + constexpr bool addGenerationPrompt = true; + const auto& tools = chatHistory.get_tools(); + const auto& kwargs = chatHistory.get_extra_context(); + const std::optional optTools = + tools.empty() ? std::nullopt : std::make_optional(tools); + const std::optional optKwargs = + kwargs.empty() ? std::nullopt : std::make_optional(kwargs); + try { + req.promptText = tokenizer->apply_chat_template( + chatHistory, addGenerationPrompt, {}, optTools, optKwargs); + } catch (const std::exception& e) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); + return absl::Status(absl::StatusCode::kInvalidArgument, + "Failed to apply chat template. The model either does not have chat template or has an invalid one."); + } +#if (PYTHON_DISABLE == 0) + } +#endif + + if (req.promptText.empty()) { + return absl::Status(absl::StatusCode::kInvalidArgument, + "Final prompt after applying chat template is empty"); + } + return absl::OkStatus(); +} + +} // namespace ovms diff --git a/src/llm/io_processing/input_processors/chat_template_processor.hpp b/src/llm/io_processing/input_processors/chat_template_processor.hpp new file mode 100644 index 0000000000..e820bbba30 --- /dev/null +++ b/src/llm/io_processing/input_processors/chat_template_processor.hpp @@ -0,0 +1,64 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include +#include +#include + +#include + +#include "../base_input_processor.hpp" + +#if (PYTHON_DISABLE == 0) +#include "../../py_jinja_template_processor.hpp" +#endif + +namespace ovms { + +// Applies the chat template to ChatHistory, producing req.promptText. +// Active when: input is ChatHistory variant (CHAT_COMPLETIONS and RESPONSES). +// +// Under PYTHON_DISABLE==0 two constructors select the path: +// - PyJinja constructor (takes PyJinjaTemplateProcessor&): uses the Python Jinja engine. +// - Minja constructor (tokenizer only): calls tokenizer.apply_chat_template(). +// Under PYTHON_DISABLE==1 only the native tokenizer.apply_chat_template() path exists. +class ChatTemplateProcessor : public BaseInputProcessor { +public: +#if (PYTHON_DISABLE == 0) + // PyJinja path: templateProcessor must be valid (guaranteed by non-null reference param). + ChatTemplateProcessor(ov::genai::Tokenizer& tokenizer, + PyJinjaTemplateProcessor& templateProcessor); + // Minja / native-OV path: no PyJinja processor needed. + explicit ChatTemplateProcessor(ov::genai::Tokenizer& tokenizer); +#else + explicit ChatTemplateProcessor(ov::genai::Tokenizer& tokenizer); +#endif + + absl::Status process(InputRequest& req) override; + +private: + ov::genai::Tokenizer* tokenizer; // non-owning; lifetime tied to InputProcessorContext +#if (PYTHON_DISABLE == 0) + // Present only on the PyJinja path; nullopt → use tokenizer.apply_chat_template(). + std::optional> templateProcessor; + // Serialises chatHistory to {"messages":[...], "tools":[...], "chat_template_kwargs":{...}} + // for the Python Jinja template engine. + static std::string serializeForPyJinja(const ov::genai::ChatHistory& chatHistory); +#endif +}; + +} // namespace ovms diff --git a/src/llm/io_processing/input_processors/image_decoding_processor.cpp b/src/llm/io_processing/input_processors/image_decoding_processor.cpp new file mode 100644 index 0000000000..928122cb63 --- /dev/null +++ b/src/llm/io_processing/input_processors/image_decoding_processor.cpp @@ -0,0 +1,96 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include "image_decoding_processor.hpp" + +#include +#include + +#include "../../apis/openai_api_handler.hpp" +#include "../../../logging.hpp" + +namespace ovms { + +ImageDecodingProcessor::ImageDecodingProcessor( + std::optional allowedLocalMediaPath, + std::optional> allowedMediaDomains) : + allowedLocalMediaPath(std::move(allowedLocalMediaPath)), + allowedMediaDomains(std::move(allowedMediaDomains)) {} + +absl::Status ImageDecodingProcessor::process(InputRequest& req) { + ov::genai::ChatHistory& chatHistory = std::get(req.input); + + // Injection guard: reject requests that already contain image tags to + // prevent prompt injection via pre-baked tags. + for (size_t i = 0; i < chatHistory.size(); i++) { + const auto content = chatHistory[i]["content"]; + // Check plain string content. + if (content.as_string().value_or("").find(" tag"); + } + // Check text parts within array content (multimodal messages). + if (content.is_array()) { + for (size_t j = 0; j < content.size(); j++) { + const auto part = content[j]; + if (part["type"].as_string().value_or("") == "text") { + if (part["text"].as_string().value_or("").find(" tag"); + } + } + } + } + } + + size_t imageIndex = 0; + for (size_t i = 0; i < chatHistory.size(); i++) { + const auto content = chatHistory[i]["content"]; + if (!content.is_array()) { + continue; + } + + // Accumulate image tags and text parts from a single message's content array. + std::string imageTags; + std::string textContent; + + for (size_t j = 0; j < content.size(); j++) { + const auto part = content[j]; + const auto type = part["type"].as_string().value_or(""); + + if (type == "image_url") { + const auto url = part["image_url"]["url"].as_string().value_or(""); + auto imageResult = loadImage(url, allowedLocalMediaPath, allowedMediaDomains); + if (!imageResult.ok()) { + return imageResult.status(); + } + req.inputImages.push_back(std::move(imageResult).value()); + imageTags += "\n"; + } else if (type == "text") { + if (!textContent.empty()) { + textContent += "\n"; + } + textContent += part["text"].as_string().value_or(""); + } + } + + if (!imageTags.empty() || !textContent.empty()) { + chatHistory[i]["content"] = imageTags + textContent; + } + } + + return absl::OkStatus(); +} + +} // namespace ovms diff --git a/src/llm/io_processing/input_processors/image_decoding_processor.hpp b/src/llm/io_processing/input_processors/image_decoding_processor.hpp new file mode 100644 index 0000000000..2c0c0272b8 --- /dev/null +++ b/src/llm/io_processing/input_processors/image_decoding_processor.hpp @@ -0,0 +1,40 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include +#include +#include + +#include "../base_input_processor.hpp" + +namespace ovms { + +// Decodes image_url content entries from ChatHistory messages into tensors and +// injects tags into message content. +// Active when: config.isVLM && input is ChatHistory variant. +class ImageDecodingProcessor : public BaseInputProcessor { +public: + ImageDecodingProcessor(std::optional allowedLocalMediaPath, + std::optional> allowedMediaDomains); + absl::Status process(InputRequest& req) override; + +private: + std::optional allowedLocalMediaPath; + std::optional> allowedMediaDomains; +}; + +} // namespace ovms diff --git a/src/llm/io_processing/input_processors/raw_prompt_extractor.hpp b/src/llm/io_processing/input_processors/raw_prompt_extractor.hpp new file mode 100644 index 0000000000..94802c134c --- /dev/null +++ b/src/llm/io_processing/input_processors/raw_prompt_extractor.hpp @@ -0,0 +1,36 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include +#include +#include + +#include "../base_input_processor.hpp" + +namespace ovms { + +// Moves the raw prompt string from InputRequest::input into req.promptText. +// Active when: input is std::string variant (COMPLETIONS endpoint). +class RawPromptExtractor : public BaseInputProcessor { +public: + absl::Status process(InputRequest& req) override { + req.promptText = std::move(std::get(req.input)); + return absl::OkStatus(); + } +}; + +} // namespace ovms diff --git a/src/llm/io_processing/input_processors/text_content_normalization_processor.cpp b/src/llm/io_processing/input_processors/text_content_normalization_processor.cpp new file mode 100644 index 0000000000..af957d5a8e --- /dev/null +++ b/src/llm/io_processing/input_processors/text_content_normalization_processor.cpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include "text_content_normalization_processor.hpp" + +#include +#include + +namespace ovms { + +absl::Status TextContentNormalizationProcessor::process(InputRequest& req) { + ov::genai::ChatHistory& chatHistory = std::get(req.input); + for (size_t i = 0; i < chatHistory.size(); i++) { + const auto content = chatHistory[i]["content"]; + if (!content.is_array()) { + continue; + } + std::string combined; + for (size_t j = 0; j < content.size(); j++) { + const auto part = content[j]; + if (part["type"].as_string().value_or("") != "text") { + continue; + } + if (!combined.empty()) { + combined += "\n"; + } + combined += part["text"].as_string().value_or(""); + } + chatHistory[i]["content"] = combined; + } + return absl::OkStatus(); +} + +} // namespace ovms diff --git a/src/llm/io_processing/input_processors/text_content_normalization_processor.hpp b/src/llm/io_processing/input_processors/text_content_normalization_processor.hpp new file mode 100644 index 0000000000..0899a13660 --- /dev/null +++ b/src/llm/io_processing/input_processors/text_content_normalization_processor.hpp @@ -0,0 +1,31 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include "../base_input_processor.hpp" + +namespace ovms { + +// Normalizes text-only content arrays in ChatHistory messages to plain strings. +// Parts are joined with "\n" for backward compatibility with LM chat templates. +// Active when: !config.isVLM && input is ChatHistory variant. +// Must run before ChatTemplateProcessor. +class TextContentNormalizationProcessor : public BaseInputProcessor { +public: + absl::Status process(InputRequest& req) override; +}; + +} // namespace ovms diff --git a/src/llm/io_processing/input_processors/tokenization_processor.cpp b/src/llm/io_processing/input_processors/tokenization_processor.cpp new file mode 100644 index 0000000000..757497a254 --- /dev/null +++ b/src/llm/io_processing/input_processors/tokenization_processor.cpp @@ -0,0 +1,34 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include + +#include "tokenization_processor.hpp" + +namespace ovms { + +TokenizationProcessor::TokenizationProcessor(ov::genai::Tokenizer& tokenizer, bool addSpecialTokens) : + tokenizer(&tokenizer), + addSpecialTokens(addSpecialTokens) {} + +absl::Status TokenizationProcessor::process(InputRequest& req) { + req.inputIds = tokenizer->encode(req.promptText, + ov::genai::add_special_tokens(addSpecialTokens)) + .input_ids; + return absl::OkStatus(); +} + +} // namespace ovms diff --git a/src/llm/io_processing/input_processors/tokenization_processor.hpp b/src/llm/io_processing/input_processors/tokenization_processor.hpp new file mode 100644 index 0000000000..47b3e33b92 --- /dev/null +++ b/src/llm/io_processing/input_processors/tokenization_processor.hpp @@ -0,0 +1,39 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include + +#include "../base_input_processor.hpp" + +namespace ovms { + +// Encodes req.promptText into req.inputIds using the servable tokenizer. +// Active when: all paths (LM chat, LM completions, VLM chat). For VLM the resulting +// inputIds are used for max-length checks and prompt token usage statistics only; +// the VLM pipeline tokenizes internally and does not receive inputIds. +// addSpecialTokens: false for chat path (template already added them), true for completions. +class TokenizationProcessor : public BaseInputProcessor { +public: + TokenizationProcessor(ov::genai::Tokenizer& tokenizer, bool addSpecialTokens); + absl::Status process(InputRequest& req) override; + +private: + ov::genai::Tokenizer* tokenizer; // non-owning; lifetime tied to InputProcessorContext + bool addSpecialTokens; +}; + +} // namespace ovms diff --git a/src/llm/io_processing/input_request.hpp b/src/llm/io_processing/input_request.hpp new file mode 100644 index 0000000000..af382ee400 --- /dev/null +++ b/src/llm/io_processing/input_request.hpp @@ -0,0 +1,48 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace ovms { + +// Discriminated union between chat-based and raw-prompt requests. +// ChatHistory: /v3/chat/completions and /v3/responses paths. +// Image content arrays are preserved as JsonContainer until ImageDecodingProcessor runs. +// std::string: /v3/completions path — raw prompt as received. +using InputPayload = std::variant; + +// Per-request data passed through the InputProcessor chain. +// parseRequest() calls extractInputRequest() which fills input + generationConfig in one step; +// InputProcessor fills the remaining output fields. +struct InputRequest { + InputPayload input; // set in parseRequest() + ov::genai::GenerationConfig generationConfig; // set in parseRequest() + + std::string promptText; // written by ChatTemplateProcessor / RawPromptExtractor + ov::Tensor inputIds; // written by TokenizationProcessor (all paths) + std::vector inputImages; // written by ImageDecodingProcessor + std::vector inputVideos; + std::vector inputAudios; +}; + +} // namespace ovms diff --git a/src/llm/language_model/continuous_batching/servable.cpp b/src/llm/language_model/continuous_batching/servable.cpp index 470e170a09..9087b06047 100644 --- a/src/llm/language_model/continuous_batching/servable.cpp +++ b/src/llm/language_model/continuous_batching/servable.cpp @@ -53,15 +53,15 @@ void ContinuousBatchingServable::notifyExecutorThread() { absl::Status ContinuousBatchingServable::addRequestToPipeline(std::shared_ptr& executionContext) { // Additional validation for big prompt and setting without dynamic split fuse (GenAI checks it during scheduling which is too late for us) - if (executionContext->inputIds.get_size() > properties->schedulerConfig.max_num_batched_tokens && properties->schedulerConfig.dynamic_split_fuse == false) { + if (executionContext->inputRequest.inputIds.get_size() > properties->schedulerConfig.max_num_batched_tokens && properties->schedulerConfig.dynamic_split_fuse == false) { SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Received request with more tokens than max_num_batch_tokens {} > {}. Without dynamic split fuse on, such request is invalid", - executionContext->inputIds.get_size(), properties->schedulerConfig.max_num_batched_tokens); - return absl::InvalidArgumentError("Input length exceeds pipeline capabilities: " + std::to_string(executionContext->inputIds.get_size()) + + executionContext->inputRequest.inputIds.get_size(), properties->schedulerConfig.max_num_batched_tokens); + return absl::InvalidArgumentError("Input length exceeds pipeline capabilities: " + std::to_string(executionContext->inputRequest.inputIds.get_size()) + " > " + std::to_string(properties->schedulerConfig.max_num_batched_tokens)); } - executionContext->generationHandle = properties->pipeline->add_request(currentRequestId++, // to be removed from API? - executionContext->inputIds, - executionContext->generationConfigBuilder->getConfig()); + executionContext->generationHandle = properties->pipeline->add_request(currentRequestId++, + executionContext->inputRequest.inputIds, + executionContext->inputRequest.generationConfig); return absl::OkStatus(); } diff --git a/src/llm/language_model/legacy/legacy_executor.cpp b/src/llm/language_model/legacy/legacy_executor.cpp index 73ac908e98..fd4e7370ef 100644 --- a/src/llm/language_model/legacy/legacy_executor.cpp +++ b/src/llm/language_model/legacy/legacy_executor.cpp @@ -35,7 +35,7 @@ void LegacyExecutor::processRequest() { } else { SPDLOG_LOGGER_TRACE(llm_executor_logger, "Generation started"); try { - requestExecutionContext->results = pipe->generate(requestExecutionContext->inputIds, requestExecutionContext->generationConfigBuilder->getConfig(), requestExecutionContext->textStreamer); + requestExecutionContext->results = pipe->generate(requestExecutionContext->inputRequest.inputIds, requestExecutionContext->inputRequest.generationConfig, requestExecutionContext->textStreamer); } catch (std::exception& e) { requestExecutionContext->success = false; SPDLOG_LOGGER_ERROR(llm_executor_logger, "LLM pipeline generation failed: {}.", e.what()); diff --git a/src/llm/language_model/legacy/servable.cpp b/src/llm/language_model/legacy/servable.cpp index 3ab86e4011..843e534b7e 100644 --- a/src/llm/language_model/legacy/servable.cpp +++ b/src/llm/language_model/legacy/servable.cpp @@ -41,6 +41,7 @@ #if (PYTHON_DISABLE == 0) #include "../../py_jinja_template_processor.hpp" #endif +#include "../../io_processing/generation_config_builder.hpp" #include "servable.hpp" namespace ovms { @@ -125,18 +126,15 @@ absl::Status LegacyServable::parseRequest(std::shared_ptrtokenizer, [](std::string) { return ov::genai::StreamingStatus::RUNNING; }); } - legacyExecutionContext->generationConfigBuilder = std::make_shared(getProperties()->baseGenerationConfig, + GenerationConfigBuilder configBuilder(getProperties()->baseGenerationConfig, getProperties()->toolParserName, getProperties()->enableToolGuidedGeneration, getProperties()->decodingMethod); - legacyExecutionContext->generationConfigBuilder->parseConfigFromRequest(legacyExecutionContext->apiHandler->getRequest()); - legacyExecutionContext->generationConfigBuilder->adjustConfigForDecodingMethod(); - try { - legacyExecutionContext->generationConfigBuilder->validateStructuredOutputConfig(getProperties()->tokenizer); - } catch (const std::exception& e) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool guided generation will not be applied due to JSON schema validation failure: {}", e.what()); - legacyExecutionContext->generationConfigBuilder->unsetStructuredOutputConfig(); + auto inputRequestResult = legacyExecutionContext->apiHandler->extractInputRequest(configBuilder); + if (!inputRequestResult.ok()) { + return inputRequestResult.status(); } + legacyExecutionContext->inputRequest = std::move(*inputRequestResult); return absl::OkStatus(); } @@ -147,7 +145,7 @@ absl::Status LegacyServable::prepareInputs(std::shared_ptrinputIds); + status = validateInputComplianceWithProperties(executionContext->inputRequest.inputIds); return status; } diff --git a/src/llm/py_jinja_template_processor.cpp b/src/llm/py_jinja_template_processor.cpp index 188a3c0daa..235e374ea8 100644 --- a/src/llm/py_jinja_template_processor.cpp +++ b/src/llm/py_jinja_template_processor.cpp @@ -35,7 +35,7 @@ namespace ovms { -bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, std::string modelsPath, const std::string& requestBody, std::string& output) { +bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, const std::string& requestBody, std::string& output) { if (templateProcessor.chatTemplate == nullptr) { output = "Error: Chat template not loaded correctly, so it cannot be applied"; return false; @@ -43,7 +43,7 @@ bool PyJinjaTemplateProcessor::applyChatTemplate(PyJinjaTemplateProcessor& templ py::gil_scoped_acquire acquire; try { auto locals = py::dict("request_body"_a = requestBody, "chat_template"_a = templateProcessor.chatTemplate->getObject(), - "tool_chat_template"_a = templateProcessor.toolTemplate->getObject(), "models_path"_a = modelsPath, + "tool_chat_template"_a = templateProcessor.toolTemplate->getObject(), "bos_token"_a = templateProcessor.bosToken, "eos_token"_a = templateProcessor.eosToken); py::exec(R"( output = "" diff --git a/src/llm/py_jinja_template_processor.hpp b/src/llm/py_jinja_template_processor.hpp index 95b9e8598b..cd388bc4ee 100644 --- a/src/llm/py_jinja_template_processor.hpp +++ b/src/llm/py_jinja_template_processor.hpp @@ -36,6 +36,6 @@ class PyJinjaTemplateProcessor { std::unique_ptr> chatTemplate = nullptr; std::unique_ptr> toolTemplate = nullptr; - static bool applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, std::string modelsPath, const std::string& requestBody, std::string& output); + static bool applyChatTemplate(PyJinjaTemplateProcessor& templateProcessor, const std::string& requestBody, std::string& output); }; } // namespace ovms diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 0d934cce0a..c0fa1ab697 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -36,6 +36,8 @@ #include "../profiler.hpp" #include "apis/openai_completions.hpp" #include "apis/openai_responses.hpp" +#include "io_processing/generation_config_builder.hpp" +#include "io_processing/input_processor.hpp" #include "ovms_text_streamer.hpp" #include "servable.hpp" #include "text_utils.hpp" @@ -158,19 +160,15 @@ absl::Status GenAiServable::parseRequest(std::shared_ptrgenerationConfigBuilder = std::make_shared(getProperties()->baseGenerationConfig, + GenerationConfigBuilder configBuilder(getProperties()->baseGenerationConfig, getProperties()->toolParserName, getProperties()->enableToolGuidedGeneration, getProperties()->decodingMethod); - executionContext->generationConfigBuilder->parseConfigFromRequest(executionContext->apiHandler->getRequest()); - executionContext->generationConfigBuilder->adjustConfigForDecodingMethod(); - try { - executionContext->generationConfigBuilder->validateStructuredOutputConfig(getProperties()->tokenizer); - } catch (const std::exception& e) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool guided generation will not be applied due to JSON schema validation failure: {}", e.what()); - executionContext->generationConfigBuilder->unsetStructuredOutputConfig(); + auto inputRequestResult = executionContext->apiHandler->extractInputRequest(configBuilder); + if (!inputRequestResult.ok()) { + return inputRequestResult.status(); } - + executionContext->inputRequest = std::move(*inputRequestResult); return absl::OkStatus(); } @@ -179,130 +177,59 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrapiHandler->getImageHistory().size() > 0) { - return absl::InternalError("This servable supports only text input, but image_url has been provided"); - } - - std::string inputText; - switch (executionContext->endpoint) { - case Endpoint::CHAT_COMPLETIONS: { -#if (PYTHON_DISABLE == 0) - if (getProperties()->chatTemplateMode == ChatTemplateMode::JINJA) { - bool success; - if (executionContext->apiHandler->getProcessedJson().size() > 0) { - success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); - } else { - success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->payload.body, inputText); - } - if (!success) { - return absl::Status(absl::StatusCode::kInvalidArgument, inputText); - } - } else // NOLINT(readability/braces) -#endif - { - ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory(); - constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded - auto toolParsingResult = executionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolParsingResult.ok()) { - return toolParsingResult.status(); - } - const auto& tools = toolParsingResult.value(); - auto chatTemplateKwargsParsingResult = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsParsingResult.ok()) { - return chatTemplateKwargsParsingResult.status(); - } - const auto& chatTemplateKwargs = chatTemplateKwargsParsingResult.value(); - try { - inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); - } catch (const std::exception& e) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); - return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); - } - } - if (inputText.size() == 0) { - return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); - } - if (executionContext->apiHandler->getOutputParser() != nullptr) { - executionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(inputText); - } - break; - } - case Endpoint::RESPONSES: { - if (executionContext->apiHandler->getChatHistory().size() > 0) { -#if (PYTHON_DISABLE == 0) - if (getProperties()->chatTemplateMode == ChatTemplateMode::JINJA) { - bool success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); - if (!success) { - return absl::Status(absl::StatusCode::kInvalidArgument, inputText); - } - } else // NOLINT(readability/braces) -#endif - { - ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory(); - constexpr bool addGenerationPrompt = true; - auto toolParsingResult = executionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolParsingResult.ok()) { - return toolParsingResult.status(); - } - const auto& tools = toolParsingResult.value(); - auto chatTemplateKwargsParsingResult = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsParsingResult.ok()) { - return chatTemplateKwargsParsingResult.status(); - } - const auto& chatTemplateKwargs = chatTemplateKwargsParsingResult.value(); - try { - inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); - } catch (const std::exception& e) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); - return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); + // LM servables reject requests containing image content. Images are preserved + // as JsonContainer arrays in chatHistory. Reject if any message's content array + // contains an image_url entry. + if (!getProperties()->inputProcessorContext.config.isVLM && + std::holds_alternative(executionContext->inputRequest.input)) { + const auto& ch = std::get(executionContext->inputRequest.input); + for (size_t i = 0; i < ch.size(); i++) { + const auto content = ch[i]["content"]; + if (content.is_array()) { + for (size_t j = 0; j < content.size(); j++) { + if (content[j]["type"].as_string().value_or("") == "image_url") { + return absl::Status(absl::StatusCode::kInvalidArgument, "This servable supports only text input, but image_url has been provided"); + } } } - if (inputText.size() == 0) { - return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); - } - if (executionContext->apiHandler->getOutputParser() != nullptr) { - executionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(inputText); - } - } else { - auto prompt = executionContext->apiHandler->getPrompt(); - if (!prompt.has_value()) { - return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing"); - } - inputText = prompt.value(); } - break; } - case Endpoint::COMPLETIONS: { - inputText = executionContext->apiHandler->getPrompt().value(); - break; + + InputRequest& req = executionContext->inputRequest; + InputProcessor processor(getProperties()->inputProcessorContext, req); + auto status = processor.process(req); + if (!status.ok()) { + return status; } - case Endpoint::TOKENIZE: - return absl::InternalError("Tokenize endpoint should not reach prepareInputs stage"); + + if (executionContext->apiHandler->getOutputParser() != nullptr) { + executionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(req.promptText); } if (Config::instance().getServerSettings().verboseResponse) { - executionContext->apiHandler->enableVerboseResponse(inputText); + executionContext->apiHandler->enableVerboseResponse(req.promptText); } - bool encodeAddSpecialTokens = (executionContext->endpoint == Endpoint::COMPLETIONS); - executionContext->inputIds = getProperties()->tokenizer.encode(inputText, ov::genai::add_special_tokens(encodeAddSpecialTokens)).input_ids; if (getProperties()->maxModelLength.has_value()) { - if (executionContext->inputIds.get_size() > getProperties()->maxModelLength.value()) { + if (req.inputIds.get_size() > getProperties()->maxModelLength.value()) { std::stringstream ss; - ss << "Number of prompt tokens: " << executionContext->inputIds.get_size() << " exceeds model max length: " << getProperties()->maxModelLength.value(); - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str()); + ss << "Number of prompt tokens: " << req.inputIds.get_size() + << " exceeds model max length: " << getProperties()->maxModelLength.value(); + SPDLOG_LOGGER_WARN(llm_calculator_logger, ss.str()); return absl::Status(absl::StatusCode::kInvalidArgument, ss.str()); } - if (executionContext->apiHandler->getMaxTokens().has_value() && executionContext->inputIds.get_size() + executionContext->apiHandler->getMaxTokens().value() > getProperties()->maxModelLength.value()) { + if (executionContext->apiHandler->getMaxTokens().has_value() && + req.inputIds.get_size() + static_cast(executionContext->apiHandler->getMaxTokens().value()) > + getProperties()->maxModelLength.value()) { std::stringstream ss; - ss << "Number of prompt tokens: " << executionContext->inputIds.get_size() << " + max tokens value: " << executionContext->apiHandler->getMaxTokens().value() << " exceeds model max length: " << getProperties()->maxModelLength.value(); - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, ss.str()); + ss << "Number of prompt tokens: " << req.inputIds.get_size() + << " + max tokens value: " << executionContext->apiHandler->getMaxTokens().value() + << " exceeds model max length: " << getProperties()->maxModelLength.value(); + SPDLOG_LOGGER_WARN(llm_calculator_logger, ss.str()); return absl::Status(absl::StatusCode::kInvalidArgument, ss.str()); } } - - executionContext->apiHandler->setPromptTokensUsage(executionContext->inputIds.get_size()); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Pipeline input text: {}", inputText); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "{}", getPromptTokensString(executionContext->inputIds)); + executionContext->apiHandler->setPromptTokensUsage(req.inputIds.get_size()); + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "{}", getPromptTokensString(req.inputIds)); + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Pipeline input text: {}", req.promptText); return absl::OkStatus(); } diff --git a/src/llm/servable.hpp b/src/llm/servable.hpp index 6d1669735a..f7a8d96f77 100644 --- a/src/llm/servable.hpp +++ b/src/llm/servable.hpp @@ -36,7 +36,9 @@ #include "../http_payload.hpp" #include "../sse_utils.hpp" #include "apis/openai_api_handler.hpp" -#include "io_processing/generation_config_builder.hpp" +#include "io_processing/base_generation_config_builder.hpp" +#include "io_processing/input_processor_context.hpp" +#include "io_processing/input_request.hpp" #if (PYTHON_DISABLE == 0) #include "py_jinja_template_processor.hpp" #endif @@ -132,10 +134,8 @@ struct GenAiServableExecutionContext { HttpPayload payload; Endpoint endpoint; std::shared_ptr apiHandler; - std::shared_ptr generationConfigBuilder; - // Single tensor with inputIds for the model. This is considered general for all pipelines, - // but depending on particular pipeline implementation it might be not required or on the other hand, insufficient. - ov::Tensor inputIds; + // Populated in parseRequest(); carries all GenAI inputs including the generation config. + InputRequest inputRequest; // Required for generating output and handle request on the calculator side std::vector generationOutputs; std::string response; @@ -180,6 +180,9 @@ struct GenAiServableProperties { ov::genai::Tokenizer tokenizer; // Specific pipeline properties bool eagle3Mode = false; + // Controls which steps InputProcessor builds for this servable type. + // Aggregated per-deployment context for InputProcessor. + InputProcessorContext inputProcessorContext; #if (PYTHON_DISABLE == 0) PyJinjaTemplateProcessor templateProcessor; @@ -227,7 +230,7 @@ class GenAiServable { virtual absl::Status parseRequest(std::shared_ptr& executionContext); /* - prepareInputs method implementation MUST fill executionContext inputIds field. + prepareInputs method implementation MUST fill executionContext inputRequest.inputIds field. Base implementation applies chat template to the payload body and encodes it with tokenizer. */ virtual absl::Status prepareInputs(std::shared_ptr& executionContext); diff --git a/src/llm/servable_initializer.cpp b/src/llm/servable_initializer.cpp index 90673fdbc3..1d09334620 100644 --- a/src/llm/servable_initializer.cpp +++ b/src/llm/servable_initializer.cpp @@ -82,6 +82,13 @@ void GenAiServableInitializer::loadChatTemplate(std::shared_ptrinputProcessorContext.tokenizer = properties->tokenizer; + properties->inputProcessorContext.config.useMinja = (properties->chatTemplateMode != ChatTemplateMode::JINJA); +#if (PYTHON_DISABLE == 0) + properties->inputProcessorContext.templateProcessor = &properties->templateProcessor; +#endif } #if (PYTHON_DISABLE == 0) diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 8b65ac7fe0..57cce69c85 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include "src/port/rapidjson_document.hpp" @@ -39,8 +38,8 @@ namespace ovms { absl::Status VisualLanguageModelServable::addRequestToPipeline(std::shared_ptr& executionContext) { auto vlmExecutionContext = std::static_pointer_cast(executionContext); vlmExecutionContext->generationHandle = properties->pipeline->add_request(currentRequestId++, // to be removed from API? - vlmExecutionContext->inputText, vlmExecutionContext->inputImages, - vlmExecutionContext->generationConfigBuilder->getConfig()); + vlmExecutionContext->inputRequest.promptText, vlmExecutionContext->inputRequest.inputImages, + vlmExecutionContext->inputRequest.generationConfig); return absl::OkStatus(); } @@ -72,120 +71,4 @@ std::shared_ptr VisualLanguageModelServable::getPropert return properties; } -absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr& executionContext) { - auto vlmExecutionContext = std::static_pointer_cast(executionContext); - if (vlmExecutionContext->apiHandler == nullptr) { - return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized"); - } - if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) { - ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory(); - - for (size_t i = 0; i < chatHistory.size(); i++) { - const auto& message = chatHistory[i]; - if (message["content"].as_string().value_or("").find(" tag"); - } - } - - const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); - size_t imageIndex = 0; - std::unordered_map imageTags; - for (const auto& image : imageHistory) { - const auto& [chatTurnIndex, imageTensor] = image; - std::string imageTag = "\n"; - imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag; - vlmExecutionContext->inputImages.push_back(imageTensor); - } - - for (const auto& [chatTurnIndex, imageTagString] : imageTags) { - std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or(""); - chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent; - } - -#if (PYTHON_DISABLE == 0) - if (getProperties()->chatTemplateMode == ChatTemplateMode::JINJA) { - std::string jsonForTemplate; - if (vlmExecutionContext->apiHandler->getProcessedJson().size() > 0) { - jsonForTemplate = vlmExecutionContext->apiHandler->getProcessedJson(); - } else { - jsonForTemplate = vlmExecutionContext->payload.body; - } - // Inject image tags into the JSON messages for Python Jinja template processing - if (!imageTags.empty()) { - rapidjson::Document jsonDoc; - jsonDoc.Parse(jsonForTemplate.c_str()); - if (!jsonDoc.HasParseError() && jsonDoc.IsObject() && jsonDoc.HasMember("messages") && jsonDoc["messages"].IsArray()) { - auto& messages = jsonDoc["messages"]; - for (const auto& [chatTurnIndex, imageTagString] : imageTags) { - if (chatTurnIndex < messages.Size()) { - auto& msg = messages[chatTurnIndex]; - if (msg.IsObject() && msg.HasMember("content") && msg["content"].IsString()) { - std::string newContent = imageTagString + msg["content"].GetString(); - msg["content"].SetString(newContent.c_str(), newContent.length(), jsonDoc.GetAllocator()); - } - } - } - rapidjson::StringBuffer buffer; - rapidjson::Writer writer(buffer); - jsonDoc.Accept(writer); - jsonForTemplate = buffer.GetString(); - } - } - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "VLM CB: Applying chat template using Python Jinja processor"); - bool success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, jsonForTemplate, vlmExecutionContext->inputText); - if (!success) { - return absl::Status(absl::StatusCode::kInvalidArgument, vlmExecutionContext->inputText); - } - } else // NOLINT(readability/braces) -#endif - { - constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded - auto toolParsingResult = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolParsingResult.ok()) { - return toolParsingResult.status(); - } - const auto& tools = toolParsingResult.value(); - auto chatTemplateKwargsParsingResult = vlmExecutionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsParsingResult.ok()) { - return chatTemplateKwargsParsingResult.status(); - } - const auto& chatTemplateKwargs = chatTemplateKwargsParsingResult.value(); - if (llm_calculator_logger->should_log(spdlog::level::trace)) { - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatHistory messages: {}", chatHistory.get_messages().to_json_string()); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatHistory.get_tools(): {}", chatHistory.get_tools().to_json_string()); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatHistory.get_extra_context(): {}", chatHistory.get_extra_context().to_json_string()); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM tools: {}", tools.has_value() ? tools->to_json_string() : std::string("")); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatTemplateKwargs: {}", chatTemplateKwargs.has_value() ? chatTemplateKwargs->to_json_string() : std::string("")); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM addGenerationPrompt: {}", addGenerationPrompt); - } - try { - vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); - } catch (const std::exception& e) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); - return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); - } - } - if (vlmExecutionContext->inputText.empty()) { - return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); - } - if (vlmExecutionContext->apiHandler->getOutputParser() != nullptr) { - vlmExecutionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(vlmExecutionContext->inputText); - } - } else { - return absl::InvalidArgumentError("Unsupported endpoint"); - } - - if (Config::instance().getServerSettings().verboseResponse) { - vlmExecutionContext->apiHandler->enableVerboseResponse(vlmExecutionContext->inputText); - } - - // Below logic is used only for the statistics and debugging purposes and does not affect the model execution. - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM input text: {}", vlmExecutionContext->inputText); - bool encodeAddSpecialTokens = false; // assuming chat template application added special tokens - ov::Tensor inputTextIds = getProperties()->tokenizer.encode(vlmExecutionContext->inputText, ov::genai::add_special_tokens(encodeAddSpecialTokens)).input_ids; - vlmExecutionContext->apiHandler->setPromptTokensUsage(inputTextIds.get_size()); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "{}", getPromptTokensString(inputTextIds)); - - return absl::OkStatus(); -} } // namespace ovms diff --git a/src/llm/visual_language_model/continuous_batching/servable.hpp b/src/llm/visual_language_model/continuous_batching/servable.hpp index bbbfd2c067..6ee6acf75b 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.hpp +++ b/src/llm/visual_language_model/continuous_batching/servable.hpp @@ -32,16 +32,13 @@ This servable also reuses CB servable initializer. using VisualLanguageModelServableProperties = ContinuousBatchingServableProperties; struct VisualLanguageModelServableExecutionContext : public ContinuousBatchingServableExecutionContext { - // Currently, scheduleExecution uses add_request call with prompt as std::string and images as std::vector - // so prepareInputs provides inputText and inputImages instead of inputIds from the base class. - std::vector inputImages; - std::string inputText; }; class VisualLanguageModelServable : public ContinuousBatchingServable { public: VisualLanguageModelServable() { properties = std::make_shared(); + properties->inputProcessorContext.config.isVLM = true; #if (PYTHON_DISABLE == 0) // TODO(dkalinow): once we have server-side workaround, set default back to JINJA properties->chatTemplateMode = ChatTemplateMode::MINJA; @@ -55,6 +52,5 @@ class VisualLanguageModelServable : public ContinuousBatchingServable { absl::Status loadRequest(std::shared_ptr& executionContext, const HttpPayload& payload) override; std::shared_ptr createExecutionContext() override; std::shared_ptr getProperties() override; - absl::Status prepareInputs(std::shared_ptr& executionContext) override; }; } // namespace ovms diff --git a/src/llm/visual_language_model/legacy/legacy_executor.cpp b/src/llm/visual_language_model/legacy/legacy_executor.cpp index 319550c612..795cedccd4 100644 --- a/src/llm/visual_language_model/legacy/legacy_executor.cpp +++ b/src/llm/visual_language_model/legacy/legacy_executor.cpp @@ -40,7 +40,7 @@ void VisualLanguageModelLegacyExecutor::processRequest() { } else { SPDLOG_LOGGER_TRACE(llm_executor_logger, "Generation started"); try { - requestExecutionContext->results = pipe->generate(requestExecutionContext->inputText, requestExecutionContext->inputImages, requestExecutionContext->generationConfigBuilder->getConfig(), requestExecutionContext->textStreamer); + requestExecutionContext->results = pipe->generate(requestExecutionContext->inputRequest.promptText, requestExecutionContext->inputRequest.inputImages, requestExecutionContext->inputRequest.generationConfig, requestExecutionContext->textStreamer); } catch (std::exception& e) { requestExecutionContext->success = false; SPDLOG_LOGGER_ERROR(llm_executor_logger, "VLM pipeline generation failed: {}.", e.what()); diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 399682d6c5..6b0c6c50c7 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -47,6 +46,7 @@ #if (PYTHON_DISABLE == 0) #include "../../py_jinja_template_processor.hpp" #endif +#include "../../io_processing/generation_config_builder.hpp" #include "servable.hpp" namespace ovms { @@ -169,18 +169,15 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptrgenerationConfigBuilder = std::make_shared(getProperties()->baseGenerationConfig, + GenerationConfigBuilder configBuilder(getProperties()->baseGenerationConfig, getProperties()->toolParserName, getProperties()->enableToolGuidedGeneration, getProperties()->decodingMethod); - legacyExecutionContext->generationConfigBuilder->parseConfigFromRequest(legacyExecutionContext->apiHandler->getRequest()); - legacyExecutionContext->generationConfigBuilder->adjustConfigForDecodingMethod(); - try { - legacyExecutionContext->generationConfigBuilder->validateStructuredOutputConfig(getProperties()->tokenizer); - } catch (const std::exception& e) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Tool guided generation will not be applied due to JSON schema validation failure: {}", e.what()); - legacyExecutionContext->generationConfigBuilder->unsetStructuredOutputConfig(); + auto inputRequestResult = legacyExecutionContext->apiHandler->extractInputRequest(configBuilder); + if (!inputRequestResult.ok()) { + return inputRequestResult.status(); } + legacyExecutionContext->inputRequest = std::move(*inputRequestResult); return absl::OkStatus(); } @@ -318,112 +315,4 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar return absl::OkStatus(); } -absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr& executionContext) { - auto vlmExecutionContext = std::static_pointer_cast(executionContext); - if (vlmExecutionContext->apiHandler == nullptr) { - return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized"); - } - if (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) { - ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory(); - - for (size_t i = 0; i < chatHistory.size(); i++) { - const auto& message = chatHistory[i]; - if (message["content"].as_string().value_or("").find(" tag"); - } - } - - const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); - size_t imageIndex = 0; - std::unordered_map imageTags; - for (const auto& image : imageHistory) { - const auto& [chatTurnIndex, imageTensor] = image; - std::string imageTag = "\n"; - imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag; - vlmExecutionContext->inputImages.push_back(imageTensor); - } - for (const auto& [chatTurnIndex, imageTagString] : imageTags) { - std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or(""); - chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent; - } - -#if (PYTHON_DISABLE == 0) - if (getProperties()->chatTemplateMode == ChatTemplateMode::JINJA) { - std::string jsonForTemplate; - if (vlmExecutionContext->apiHandler->getProcessedJson().size() > 0) { - jsonForTemplate = vlmExecutionContext->apiHandler->getProcessedJson(); - } else { - jsonForTemplate = vlmExecutionContext->payload.body; - } - // Inject image tags into the JSON messages for Python Jinja template processing - if (!imageTags.empty()) { - rapidjson::Document jsonDoc; - jsonDoc.Parse(jsonForTemplate.c_str()); - if (!jsonDoc.HasParseError() && jsonDoc.IsObject() && jsonDoc.HasMember("messages") && jsonDoc["messages"].IsArray()) { - auto& messages = jsonDoc["messages"]; - for (const auto& [chatTurnIndex, imageTagString] : imageTags) { - if (chatTurnIndex < messages.Size()) { - auto& msg = messages[chatTurnIndex]; - if (msg.IsObject() && msg.HasMember("content") && msg["content"].IsString()) { - std::string newContent = imageTagString + msg["content"].GetString(); - msg["content"].SetString(newContent.c_str(), newContent.length(), jsonDoc.GetAllocator()); - } - } - } - rapidjson::StringBuffer buffer; - rapidjson::Writer writer(buffer); - jsonDoc.Accept(writer); - jsonForTemplate = buffer.GetString(); - } - } - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "VLM Legacy: Applying chat template using Python Jinja processor"); - bool success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, jsonForTemplate, vlmExecutionContext->inputText); - if (!success) { - return absl::Status(absl::StatusCode::kInvalidArgument, vlmExecutionContext->inputText); - } - } else // NOLINT(readability/braces) -#endif - { - constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded - auto toolParsingResult = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolParsingResult.ok()) { - return toolParsingResult.status(); - } - const auto& tools = toolParsingResult.value(); - auto chatTemplateKwargsParsingResult = vlmExecutionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsParsingResult.ok()) { - return chatTemplateKwargsParsingResult.status(); - } - const auto& chatTemplateKwargs = chatTemplateKwargsParsingResult.value(); - try { - vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); - } catch (const std::exception& e) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); - return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); - } - } - if (vlmExecutionContext->inputText.empty()) { - return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); - } - if (vlmExecutionContext->apiHandler->getOutputParser() != nullptr) { - vlmExecutionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(vlmExecutionContext->inputText); - } - } else { - return absl::InvalidArgumentError("Unsupported endpoint"); - } - - if (Config::instance().getServerSettings().verboseResponse) { - vlmExecutionContext->apiHandler->enableVerboseResponse(vlmExecutionContext->inputText); - } - - // Below logic is used only for the statistics and debugging purposes and does not affect the model execution. - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM input text: {}", vlmExecutionContext->inputText); - bool encodeAddSpecialTokens = false; // assuming chat template application added special tokens - ov::Tensor inputTextIds = getProperties()->tokenizer.encode(vlmExecutionContext->inputText, ov::genai::add_special_tokens(encodeAddSpecialTokens)).input_ids; - vlmExecutionContext->apiHandler->setPromptTokensUsage(inputTextIds.get_size()); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "{}", getPromptTokensString(inputTextIds)); - - return absl::OkStatus(); -} - } // namespace ovms diff --git a/src/llm/visual_language_model/legacy/servable.hpp b/src/llm/visual_language_model/legacy/servable.hpp index 33daf46d53..886c97c7d8 100644 --- a/src/llm/visual_language_model/legacy/servable.hpp +++ b/src/llm/visual_language_model/legacy/servable.hpp @@ -31,8 +31,6 @@ struct VisualLanguageModelLegacyServableExecutionContext : public GenAiServableE ov::genai::VLMDecodedResults results; std::promise readySignal; std::future finished = readySignal.get_future(); - std::vector inputImages; - std::string inputText; // Workaround needed to pass generation config to the executor that requires it ov::genai::GenerationConfig baseGenerationConfig; bool success{true}; @@ -64,6 +62,7 @@ class VisualLanguageModelLegacyServable : public GenAiServable { public: VisualLanguageModelLegacyServable() { properties = std::make_shared(); + properties->inputProcessorContext.config.isVLM = true; #if (PYTHON_DISABLE == 0) // TODO(dkalinow): once we have server-side workaround, set default back to JINJA properties->chatTemplateMode = ChatTemplateMode::MINJA; @@ -80,6 +79,5 @@ class VisualLanguageModelLegacyServable : public GenAiServable { absl::Status prepareCompleteResponse(std::shared_ptr& executionContext) override; absl::Status readPartialExecutionResults(std::shared_ptr& executionContext) override; absl::Status preparePartialResponse(std::shared_ptr& executionContext) override; - absl::Status prepareInputs(std::shared_ptr& executionContext) override; }; } // namespace ovms diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 3b9326bbd1..34e382cfe3 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -389,10 +389,11 @@ class HttpOpenAIHandlerParsingTest : public ::testing::Test { } void assertRequestWithTools(std::string providedTools, std::string toolsChoice, absl::StatusCode status = absl::StatusCode::kOk) { - assertRequestWithTools(providedTools, toolsChoice, "", status); + assertRequestWithTools(providedTools, toolsChoice, std::vector{}, status); } - void assertRequestWithTools(std::string providedTools, std::string toolsChoice, std::string expectedJson, absl::StatusCode status = absl::StatusCode::kOk) { + void assertRequestWithTools(std::string providedTools, std::string toolsChoice, + std::vector expectedToolNames, absl::StatusCode status = absl::StatusCode::kOk) { std::string json = R"({ "messages": [ {"role": "user", "content": "What is the weather like in Paris today?"}, @@ -416,8 +417,13 @@ class HttpOpenAIHandlerParsingTest : public ::testing::Test { std::optional maxModelLength; std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength).code(), status) << json; - json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, expectedJson); + if (status == absl::StatusCode::kOk) { + const auto& toolMap = apiHandler->getRequest().toolNameSchemaMap; + EXPECT_EQ(toolMap.size(), expectedToolNames.size()); + for (const auto& name : expectedToolNames) { + EXPECT_TRUE(toolMap.count(name) > 0) << "Expected tool '" << name << "' not found in toolNameSchemaMap"; + } + } } }; @@ -763,116 +769,6 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTextInputCreatesUser ASSERT_TRUE(chatHistory[0].contains("content")); EXPECT_EQ(chatHistory[0]["role"], "user"); EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); - if (endpoint() == ovms::Endpoint::CHAT_COMPLETIONS) { - // Chat completions with simple text does not mutate the JSON, so processedJson is empty - EXPECT_TRUE(apiHandler->getProcessedJson().empty()); - } -} - -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquivalentMessages) { - std::string json = createTextRequest("What is OpenVINO?"); - auto apiHandler = parseCurrentRequest(json); - ASSERT_NE(apiHandler, nullptr); - - // For Responses, processedJson is always built from chatHistory. - // For chat/completions with simple text, processedJson is empty (original body is used instead). - // In both cases, the chatHistory should be equivalent. - auto& chatHistory = apiHandler->getChatHistory(); - ASSERT_EQ(chatHistory.size(), 1); - EXPECT_EQ(chatHistory[0]["role"], "user"); - EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); - -#if (PYTHON_DISABLE == 0) - if (endpoint() == ovms::Endpoint::RESPONSES) { - // Responses path builds processedJson with messages array - const std::string& processedJson = apiHandler->getProcessedJson(); - ASSERT_FALSE(processedJson.empty()) << "Responses should build processedJson"; - // Verify it contains a messages array with the correct content - rapidjson::Document processedDoc; - processedDoc.Parse(processedJson.c_str()); - ASSERT_FALSE(processedDoc.HasParseError()); - ASSERT_TRUE(processedDoc.HasMember("messages")); - ASSERT_TRUE(processedDoc["messages"].IsArray()); - ASSERT_EQ(processedDoc["messages"].Size(), 1u); - EXPECT_STREQ(processedDoc["messages"][0]["role"].GetString(), "user"); - EXPECT_STREQ(processedDoc["messages"][0]["content"].GetString(), "What is OpenVINO?"); - } -#else - if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_TRUE(apiHandler->getProcessedJson().empty()) << "processedJson should be empty when Python is disabled"; - } -#endif -} - -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonEquivalentMultiMessage) { - // Test with array input containing multiple messages - std::string json; - if (endpoint() == ovms::Endpoint::RESPONSES) { - json = R"({"model":"llama","input":[ - {"role":"system","content":"You are helpful."}, - {"role":"user","content":"Hello"} - ]})"; - } else { - json = R"({"model":"llama","messages":[ - {"role":"system","content":"You are helpful."}, - {"role":"user","content":"Hello"} - ]})"; - } - auto apiHandler = parseCurrentRequest(json); - ASSERT_NE(apiHandler, nullptr); - - auto& chatHistory = apiHandler->getChatHistory(); - ASSERT_EQ(chatHistory.size(), 2); - EXPECT_EQ(chatHistory[0]["role"], "system"); - EXPECT_EQ(chatHistory[0]["content"], "You are helpful."); - EXPECT_EQ(chatHistory[1]["role"], "user"); - EXPECT_EQ(chatHistory[1]["content"], "Hello"); - -#if (PYTHON_DISABLE == 0) - if (endpoint() == ovms::Endpoint::RESPONSES) { - const std::string& processedJson = apiHandler->getProcessedJson(); - ASSERT_FALSE(processedJson.empty()); - rapidjson::Document processedDoc; - processedDoc.Parse(processedJson.c_str()); - ASSERT_FALSE(processedDoc.HasParseError()); - ASSERT_TRUE(processedDoc.HasMember("messages")); - ASSERT_EQ(processedDoc["messages"].Size(), 2u); - EXPECT_STREQ(processedDoc["messages"][0]["role"].GetString(), "system"); - EXPECT_STREQ(processedDoc["messages"][0]["content"].GetString(), "You are helpful."); - EXPECT_STREQ(processedDoc["messages"][1]["role"].GetString(), "user"); - EXPECT_STREQ(processedDoc["messages"][1]["content"].GetString(), "Hello"); - } -#else - if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_TRUE(apiHandler->getProcessedJson().empty()) << "processedJson should be empty when Python is disabled"; - } -#endif -} - -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonIncludesToolsWhenPresent) { - std::string json = createToolRequest("\"auto\""); - auto apiHandler = parseCurrentRequest(json); - ASSERT_NE(apiHandler, nullptr); - - EXPECT_TRUE(apiHandler->areToolsAvailable()); - -#if (PYTHON_DISABLE == 0) - if (endpoint() == ovms::Endpoint::RESPONSES) { - const std::string& processedJson = apiHandler->getProcessedJson(); - ASSERT_FALSE(processedJson.empty()); - rapidjson::Document processedDoc; - processedDoc.Parse(processedJson.c_str()); - ASSERT_FALSE(processedDoc.HasParseError()); - ASSERT_TRUE(processedDoc.HasMember("messages")); - ASSERT_TRUE(processedDoc.HasMember("tools")); - ASSERT_TRUE(processedDoc["tools"].IsArray()); - ASSERT_GT(processedDoc["tools"].Size(), 0u); - } -#else - if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_TRUE(apiHandler->getProcessedJson().empty()) << "processedJson should be empty when Python is disabled"; - } -#endif } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTokenLimitSetsMaxTokens) { @@ -918,7 +814,13 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingMultimodalInputImage auto apiHandler = parseCurrentRequest(json); ASSERT_NE(apiHandler, nullptr); - EXPECT_EQ(apiHandler->getImageHistory().size(), 1); + // Image decoding is deferred to ImageDecodingProcessor; imageHistory is no longer populated. + // The content array with image_url is preserved in chatHistory. + auto& chatHistory = apiHandler->getChatHistory(); + ASSERT_EQ(chatHistory.size(), 1u); + auto content = chatHistory[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[1]["type"].as_string().value_or(""), "image_url"); } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageJpegBase64Succeeds) { @@ -926,7 +828,9 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageJpegBase64Succe std::string json = createMultimodalRequestWithImageUrl(base64Image); auto apiHandler = parseCurrentRequest(json); ASSERT_NE(apiHandler, nullptr); - EXPECT_EQ(apiHandler->getImageHistory().size(), 1); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[1]["type"].as_string().value_or(""), "image_url"); } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageOnlyNoTextSucceeds) { @@ -934,7 +838,9 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageOnlyNoTextSucce std::string json = createImageOnlyRequest(base64Image); auto apiHandler = parseCurrentRequest(json); ASSERT_NE(apiHandler, nullptr); - EXPECT_EQ(apiHandler->getImageHistory().size(), 1); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingMultipleImagesInOneTurnSucceeds) { @@ -943,97 +849,115 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingMultipleImagesInOneT auto apiHandler = parseCurrentRequest(json); ASSERT_NE(apiHandler, nullptr); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 3); - // All images belong to the same user turn (chat history index 0). - for (const auto& entry : imageHistory) { - EXPECT_EQ(entry.first, 0u); - } -} - -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageEmptyUrlFails) { + // Image decoding is deferred; imageHistory is no longer populated. + // The content array (1 text + 3 images) is preserved in chatHistory. + auto& chatHistory = apiHandler->getChatHistory(); + ASSERT_EQ(chatHistory.size(), 1u); + auto content = chatHistory[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content.size(), 4u); // 1 text + 3 image_url entries + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "text"); + EXPECT_EQ(content[1]["type"].as_string().value_or(""), "image_url"); + EXPECT_EQ(content[2]["type"].as_string().value_or(""), "image_url"); + EXPECT_EQ(content[3]["type"].as_string().value_or(""), "image_url"); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageEmptyUrlPreservedInChatHistory) { + // Image loading is deferred to ImageDecodingProcessor; the handler only validates structure. + // An empty URL passes structural validation and is preserved in chatHistory. std::string json = createImageOnlyRequest(""); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json); - // Empty URL is treated as a (non-existent) local filesystem path; with no allowed_local_media_path - // configured, the loader rejects it as "Loading images from local filesystem is disabled." - EXPECT_EQ(status, absl::InvalidArgumentError("Loading images from local filesystem is disabled.")); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); + EXPECT_EQ(content[0]["image_url"]["url"].as_string().value_or("MISSING"), ""); } -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageMalformedBase64Fails) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageMalformedBase64PreservedInChatHistory) { + // Malformed base64 is not validated at parse time; decoding happens in ImageDecodingProcessor. std::string json = createImageOnlyRequest("data:image/png;base64,NOT_BASE64!@#"); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json); - EXPECT_EQ(status, absl::InvalidArgumentError("Invalid base64 string in request")); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); } -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageStringWithNoMimePrefixFails) { - // Without a "data:..." prefix the URL falls through to the local-filesystem loader, - // which is disabled by default. +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageStringWithNoMimePrefixPreservedInChatHistory) { + // Without a "data:..." prefix the URL is treated as a local filesystem path + // at decode time. The handler no longer rejects at parse; ImageDecodingProcessor handles it. std::string json = createImageOnlyRequest("iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json); - EXPECT_EQ(status, absl::InvalidArgumentError("Loading images from local filesystem is disabled.")); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); } -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystemSucceeds) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystemPreservesUrlInChatHistory) { + // Image loading from local filesystem is deferred to ImageDecodingProcessor. + // The handler now just preserves the URL in chatHistory. const std::string imageUrl = getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg"); std::string json = createImageOnlyRequest(imageUrl); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json, - getGenericFullPathForSrcTest("/ovms/src/test/binaryutils")); - ASSERT_EQ(status, absl::OkStatus()); + auto apiHandler = parseCurrentRequest(json); ASSERT_NE(apiHandler, nullptr); - ASSERT_EQ(apiHandler->getImageHistory().size(), 1); - auto [index, image] = apiHandler->getImageHistory()[0]; - EXPECT_EQ(index, 0u); - EXPECT_EQ(image.get_element_type(), ov::element::u8); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); + EXPECT_FALSE(content[0]["image_url"]["url"].as_string().value_or("").empty()); } -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystemNotWithinAllowedPathFails) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystemOutsideAllowedPathPreservesUrlInChatHistory) { + // Path validation is deferred to ImageDecodingProcessor; the handler no longer rejects here. const std::string imageUrl = getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg"); std::string json = createImageOnlyRequest(imageUrl); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json, - getGenericFullPathForSrcTest("/ovms/src/test/llm")); - EXPECT_EQ(status, absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path")); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); } -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystemPathTraversalRejected) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystemPathTraversalPreservesUrlInChatHistory) { + // Path traversal validation is deferred to ImageDecodingProcessor; handler just preserves URL. std::string imageUrlWithEscape = getGenericFullPathForSrcTest("/ovms/src/test/../test/binaryutils/rgb.jpg"); std::string json = createImageOnlyRequest(imageUrlWithEscape); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json, std::string("/ovms/")); - std::string expectedMessage = "Path " + imageUrlWithEscape + " escape with .. is forbidden."; - EXPECT_EQ(status, absl::InvalidArgumentError(expectedMessage.c_str())); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); } -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystemPrefixPathBypassPrevented) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystemPrefixPathBypassPreservesUrlInChatHistory) { + // Path validation is deferred to ImageDecodingProcessor; handler just preserves URL. const std::string allowedLocalMediaPath = getGenericFullPathForSrcTest("/ovms/src/test/binaryutils"); const std::string siblingPrefixPath = allowedLocalMediaPath + "_private/rgb.jpg"; std::string json = createImageOnlyRequest(siblingPrefixPath); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json, allowedLocalMediaPath); - EXPECT_EQ(status, absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path")); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); } -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystemNonexistentPath) { - const std::string allowedPath = getGenericFullPathForSrcTest("/ovms/src/test/"); +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystemNonexistentPathPreservesUrlInChatHistory) { + // File existence is checked in ImageDecodingProcessor; handler just preserves URL. const std::string imageUrl = getGenericFullPathForSrcTest("/ovms/src/test/not_existing.jpeg"); std::string json = createImageOnlyRequest(imageUrl); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json, allowedPath); - EXPECT_EQ(status, absl::InvalidArgumentError("Image file parsing failed")); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); } -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystemSymlinkEscapeIsRejected) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystemSymlinkEscapePreservesUrlInChatHistory) { + // Symlink escape validation is deferred to ImageDecodingProcessor; handler just preserves URL. #ifdef _WIN32 - GTEST_SKIP() << "Creating filesystem symlinks on Windows requires elevated privileges and is unreliable in CI."; + GTEST_SKIP() << "Symlink tests are not reliable on Windows CI."; #else - // The allowed directory contains a symlink pointing to a sibling directory where the real - // image lives. Accessing the image through the symlink appears to be inside the allowlist, - // but its canonical location is outside it - the authorization check must resolve the - // symlink before the allowlist comparison. const std::filesystem::path realImageDir = getGenericFullPathForSrcTest("/ovms/src/test/binaryutils"); const std::filesystem::path allowedRoot = std::filesystem::temp_directory_path() / "ovms_symlink_allowlist_test_param"; std::error_code ec; @@ -1047,43 +971,53 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageLocalFilesystem } const std::string imageUrl = (symlinkInsideAllowed / "rgb.jpg").string(); std::string json = createImageOnlyRequest(imageUrl); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json, allowedRoot.string()); + auto apiHandler = parseCurrentRequest(json); std::filesystem::remove_all(allowedRoot, ec); - EXPECT_EQ(status, absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path")); + ASSERT_NE(apiHandler, nullptr); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); #endif } -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageUrlNotInAllowedDomainsFails) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageUrlNotInAllowedDomainsPreservesUrlInChatHistory) { + // Domain validation is deferred to ImageDecodingProcessor; handler just preserves URL. std::string json = createImageOnlyRequest("http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg"); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json, std::nullopt, - std::vector{"wikipedia.com"}); - EXPECT_EQ(status, absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); } -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageUrlPartialDomainMatchFails) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageUrlPartialDomainMatchPreservesUrlInChatHistory) { + // Domain validation is deferred to ImageDecodingProcessor; handler just preserves URL. std::string json = createImageOnlyRequest("http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg"); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json, std::nullopt, - std::vector{"githubusercontent.com"}); - EXPECT_EQ(status, absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); } -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageUrlSuffixMatchAllowedDomainFails) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageUrlSuffixMatchAllowedDomainPreservesUrlInChatHistory) { + // Domain validation is deferred to ImageDecodingProcessor; handler just preserves URL. std::string json = createImageOnlyRequest("http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg"); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json, std::nullopt, - std::vector{"host.raw.githubusercontent.com"}); - EXPECT_EQ(status, absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); } -TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageUrlWildcardPatternNotSupported) { +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingImageUrlWildcardPatternPreservesUrlInChatHistory) { + // Domain validation (including wildcard rejection) is deferred to ImageDecodingProcessor. std::string json = createImageOnlyRequest("http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg"); - std::shared_ptr apiHandler; - auto status = parseCurrentRequestWithMediaAuth(apiHandler, json, std::nullopt, - std::vector{"*githubusercontent.com"}); - EXPECT_EQ(status, absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + auto content = apiHandler->getChatHistory()[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "image_url"); } INSTANTIATE_TEST_SUITE_P( @@ -1334,6 +1268,39 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseVLMSupportsToolCallsF ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized; } +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesMultipleInputTextPartsPreservedAsContentArray) { + // When a single Responses input item has multiple input_text content entries, + // ChatHistorySink builds a content array with one {"type":"text"} entry per part. + // TextContentNormalizationProcessor then joins them with '\n'. + std::string json = R"({ + "model": "llama", + "input": [{ + "role": "user", + "content": [ + {"type": "input_text", "text": "First part."}, + {"type": "input_text", "text": "Second part."} + ] + }] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + // After parsing, chatHistory content is a two-element array — both text parts are preserved. + auto content = apiHandler->getChatHistory()[0]["content"]; + ASSERT_TRUE(content.is_array()); + ASSERT_EQ(content.size(), 2u); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "text"); + EXPECT_EQ(content[0]["text"].as_string().value_or(""), "First part."); + EXPECT_EQ(content[1]["type"].as_string().value_or(""), "text"); + EXPECT_EQ(content[1]["text"].as_string().value_or(""), "Second part."); +} + TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsOutputText) { std::string json = R"({ "model": "llama", @@ -2369,811 +2336,6 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeStreamingChunkCompletionsIncludesV ASSERT_STREQ(finalChunk["__verbose"]["content"].GetString(), "Hello world"); } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) { - std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg==" - } - } - ] - } - ] - })"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 3); - std::vector expectedBytes = {110, 181, 160}; - for (size_t i = 0; i < image.get_size(); i++) { - EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]); - } - json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttp) { - SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); // CVS-180127 - std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" - } - } - ] - } - ] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - std::vector allowedDomains = {"raw.githubusercontent.com"}; - ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 225792); - json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpMultipleAllowedDomains) { - SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); // CVS-180127 - std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" - } - } - ] - } - ] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - std::vector allowedDomains = {"raw.githubusercontent.com", "githubusercontent.com", "google.com"}; - ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 225792); - json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttps) { - SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); // CVS-180127 - std::string json = R"({ -"model": "llama", -"messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" - } - } - ] - } -] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - std::vector allowedDomains = {"raw.githubusercontent.com"}; - ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 225792); - json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsUrlHttpsAllowedDomainAll) { - SKIP_AND_EXIT_IF_NOT_RUNNING_UNSTABLE(); // CVS-180127 - std::string json = R"({ -"model": "llama", -"messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" - } - } - ] - } -] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - std::vector allowedDomains = {"all"}; - ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 225792); - json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingImageJpegWithNoTextSucceeds) { - std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGIy+/oREAAA//8DiQIftNKCRwAAAABJRU5ErkJggg==" - } - } - ] - } - ] - })"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 3); - std::vector expectedBytes = {54, 245, 241}; - for (size_t i = 0; i < image.get_size(); i++) { - EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]); - } - json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"\"}]}")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageStringWithNoPrefixFails) { - std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg==" - } - } - ] - } - ] - })"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Loading images from local filesystem is disabled.")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsUrlHttpNotAllowedDomain) { - std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" - } - } - ] - } - ] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - std::vector allowedDomains = {"wikipedia.com"}; - ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsUrlMatchAllowedDomainPartially1) { - std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" - } - } - ] - } - ] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - std::vector allowedDomains = {"githubusercontent.com"}; - ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsUrlMatchAllowedDomainPartially2) { - std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" - } - } - ] - } - ] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - std::vector allowedDomains = {"host.raw.githubusercontent.com"}; - ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesFailsRegexNotSupported) { - std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "http://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/static/images/zebra.jpeg" - } - } - ] - } - ] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - std::vector allowedDomains = {"*githubusercontent.com"}; - ASSERT_EQ(apiHandler->parseMessages(std::nullopt, allowedDomains), absl::InvalidArgumentError("Given url does not match any allowed domain from allowed_media_domains")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystem) { - std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": ")" + - getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg") + R"(" - } - } - ] - } - ] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - ASSERT_EQ(apiHandler->parseMessages(getGenericFullPathForSrcTest("/ovms/src/test")), absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 3); - json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemWithinAllowedPath) { - std::string json = R"({ -"model": "llama", -"messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": ")" + getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg") + - R"(" - } - } - ] - } -] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - ASSERT_EQ(apiHandler->parseMessages(getGenericFullPathForSrcTest("/ovms/src/test/binaryutils")), absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 3); - json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}]}")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemWithinAllowedPathMixedSeparators) { -#ifndef _WIN32 - GTEST_SKIP() << "Backslash is a valid filename character on POSIX and is not treated as a path separator."; -#else - std::string json = R"({ -"model": "llama", -"messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": ")" + getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg") + - R"(" - } - } - ] - } -] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - - std::string mixedSeparatorAllowedPath = getGenericFullPathForSrcTest("/ovms/src/test/binaryutils"); - std::replace(mixedSeparatorAllowedPath.begin(), mixedSeparatorAllowedPath.end(), '/', '\\'); - - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - ASSERT_EQ(apiHandler->parseMessages(mixedSeparatorAllowedPath), absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 3); -#endif -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemNotWithinAllowedPath) { - const std::string imageUrl = getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg"); - const std::string allowedPath = getGenericFullPathForSrcTest("/ovms/src/test/llm"); - std::string json = R"({ -"model": "llama", -"messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": ")" + imageUrl + - R"(" - } - } - ] - } -] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - ASSERT_EQ(apiHandler->parseMessages(allowedPath), absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemPrefixPathBypassPrevented) { - const std::string allowedLocalMediaPath = getGenericFullPathForSrcTest("/ovms/src/test/binaryutils"); - const std::string siblingPrefixPath = allowedLocalMediaPath + "_private/rgb.jpg"; - std::string json = R"({ -"model": "llama", -"messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": ")" + siblingPrefixPath + - R"(" - } - } - ] - } -] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - ASSERT_EQ(apiHandler->parseMessages(allowedLocalMediaPath), absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemRelativeImagePathInsideAllowedPath) { - // Verify that a relative image path is resolved against the current working directory - // and accepted when the resolved location is inside allowed_local_media_path. - // Copy the fixture into cwd so the relative path is a single component (no "..", - // which FileSystem::isPathEscaped would reject before normalization). - const std::filesystem::path absoluteImage = getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg"); - const std::string relativeImageName = "ovms_relative_image_test_inside.jpg"; - const std::filesystem::path relativeImageInCwd = std::filesystem::current_path() / relativeImageName; - std::error_code ec; - std::filesystem::copy_file(absoluteImage, relativeImageInCwd, std::filesystem::copy_options::overwrite_existing, ec); - ASSERT_FALSE(ec) << "Cannot copy fixture into cwd: " << ec.message(); - const std::string allowedPath = std::filesystem::current_path().generic_string(); - std::string json = R"({ -"model": "llama", -"messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": ")" + relativeImageName + - R"(" - } - } - ] - } -] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - const auto status = apiHandler->parseMessages(allowedPath); - std::filesystem::remove(relativeImageInCwd, ec); - ASSERT_EQ(status, absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 3); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemRelativeImagePathOutsideAllowedPath) { - // A relative image path resolves against the current working directory; if the resolved - // location is outside allowed_local_media_path the request must be rejected. - const std::string imageUrl = "rgb.jpg"; - const std::string allowedPath = getGenericFullPathForSrcTest("/ovms/src/test/binaryutils"); - std::string json = R"({ -"model": "llama", -"messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": ")" + imageUrl + - R"(" - } - } - ] - } -] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - ASSERT_EQ(apiHandler->parseMessages(allowedPath), absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemRelativeAllowedPathInside) { - // A relative allowed_local_media_path is resolved against the current working directory. - // Use "." so allowlist resolves to cwd; copy the fixture into cwd so the (absolute) image - // path falls inside the resolved allowlist. - const std::filesystem::path absoluteImage = getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg"); - const std::string relativeImageName = "ovms_relative_allowed_test_inside.jpg"; - const std::filesystem::path imageInCwd = std::filesystem::current_path() / relativeImageName; - std::error_code ec; - std::filesystem::copy_file(absoluteImage, imageInCwd, std::filesystem::copy_options::overwrite_existing, ec); - ASSERT_FALSE(ec) << "Cannot copy fixture into cwd: " << ec.message(); - const std::string imageUrl = imageInCwd.generic_string(); - std::string json = R"({ -"model": "llama", -"messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": ")" + imageUrl + - R"(" - } - } - ] - } -] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - const auto status = apiHandler->parseMessages("."); - std::filesystem::remove(imageInCwd, ec); - ASSERT_EQ(status, absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1); - auto [index, image] = imageHistory[0]; - EXPECT_EQ(index, 0); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 3); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemRelativeAllowedPathOutside) { - // A relative allowed_local_media_path resolves against the current working directory; an - // absolute image path located outside of that resolved directory must still be rejected. - const std::string allowedPath = "."; - const std::string imageUrl = getGenericFullPathForSrcTest("/ovms/src/test/binaryutils/rgb.jpg"); - if (std::filesystem::path(imageUrl).lexically_normal().string().rfind( - std::filesystem::current_path().lexically_normal().string(), 0) == 0) { - GTEST_SKIP() << "Image path is inside the current working directory; cannot exercise the outside-of-allowlist case."; - } - std::string json = R"({ -"model": "llama", -"messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": ")" + imageUrl + - R"(" - } - } - ] - } -] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - ASSERT_EQ(apiHandler->parseMessages(allowedPath), absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemSymlinkEscapeIsRejected) { -#ifdef _WIN32 - GTEST_SKIP() << "Creating filesystem symlinks on Windows requires elevated privileges and is unreliable in CI."; -#else - // Build an allowed directory that contains a symlink pointing to a sibling directory holding the - // real image. The image, when accessed through the symlink, appears to live inside the allowlist, - // but its canonical location is outside it. This regression test ensures the authorization check - // resolves the symlink (via weakly_canonical) before the allowlist comparison. - const std::filesystem::path realImageDir = getGenericFullPathForSrcTest("/ovms/src/test/binaryutils"); - const std::filesystem::path allowedRoot = std::filesystem::temp_directory_path() / "ovms_symlink_allowlist_test"; - std::error_code ec; - std::filesystem::remove_all(allowedRoot, ec); - ASSERT_TRUE(std::filesystem::create_directories(allowedRoot, ec)) << ec.message(); - const std::filesystem::path symlinkInsideAllowed = allowedRoot / "linked"; - std::filesystem::create_directory_symlink(realImageDir, symlinkInsideAllowed, ec); - if (ec) { - std::filesystem::remove_all(allowedRoot); - GTEST_SKIP() << "Cannot create symlink for test: " << ec.message(); - } - const std::string imageUrl = (symlinkInsideAllowed / "rgb.jpg").string(); - std::string json = R"({ -"model": "llama", -"messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": ")" + imageUrl + - R"(" - } - } - ] - } -] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - const auto status = apiHandler->parseMessages(allowedRoot.string()); - std::filesystem::remove_all(allowedRoot, ec); - ASSERT_EQ(status, absl::InvalidArgumentError("Given filepath is not subpath of allowed_local_media_path")); -#endif -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemInvalidPath) { - const std::string allowedPath = getGenericFullPathForSrcTest("/ovms/src/test/"); - const std::string imageUrl = getGenericFullPathForSrcTest("/ovms/src/test/not_existing.jpeg"); - std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": ")" + - imageUrl + R"(" - } - } - ] - } - ] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseMessages(allowedPath), absl::InvalidArgumentError("Image file parsing failed")); -} - -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageLocalFilesystemInvalidEscaped) { - std::string json = R"({ -"model": "llama", -"messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What is in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": ")" + getGenericFullPathForSrcTest("/ovms/src/test/../test/binaryutils/rgb.jpg") + - R"(" - } - } - ] - } -] -})"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - std::string expectedMessage = "Path " + getGenericFullPathForSrcTest("/ovms/src/test/../test/binaryutils/rgb.jpg") + " escape with .. is forbidden."; - EXPECT_EQ(apiHandler->parseMessages("/ovms/"), absl::InvalidArgumentError(expectedMessage.c_str())); -} - TEST_F(HttpOpenAIHandlerParsingTest, ParsingMultipleMessagesSucceeds) { std::string json = R"({ "model": "llama", @@ -3236,25 +2398,19 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMultipleMessagesSucceeds) { ASSERT_FALSE(doc.HasParseError()); std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 2); - std::vector expectedBytes = {110, 181, 160}; - std::vector expectedImageIndexes = {0, 2}; - size_t i = 0; - for (auto [index, image] : imageHistory) { - EXPECT_EQ(index, expectedImageIndexes[i++]); - EXPECT_EQ(image.get_element_type(), ov::element::u8); - EXPECT_EQ(image.get_size(), 3); - for (size_t i = 0; i < image.get_size(); i++) { - EXPECT_EQ(expectedBytes[i], ((uint8_t*)image.data())[i]); - } - } - json = apiHandler->getProcessedJson(); - EXPECT_EQ(json, std::string("{\"model\":\"llama\",\"messages\":[{\"role\":\"user\",\"content\":\"What is in this image?\"}," - "{\"role\":\"assistant\",\"content\":\"No idea my friend.\"}," - "{\"role\":\"user\",\"content\":\"What about this one?\"}," - "{\"role\":\"assistant\",\"content\":\"Same thing. I'm not very good with images.\"}," - "{\"role\":\"user\",\"content\":\"You were not trained with images, were you?\"}]}")); + // After Phase 6: images are preserved as JsonContainer arrays; imageHistory no longer populated. + // Messages 0 and 2 (user + image) have array content preserved. + EXPECT_TRUE(apiHandler->getChatHistory()[0]["content"].is_array()); + EXPECT_EQ(apiHandler->getChatHistory()[0]["content"][1]["type"].as_string().value_or(""), "image_url"); + EXPECT_TRUE(apiHandler->getChatHistory()[2]["content"].is_array()); + EXPECT_EQ(apiHandler->getChatHistory()[2]["content"][1]["type"].as_string().value_or(""), "image_url"); + // Messages 1 and 3 (assistant, text-only array) are preserved as arrays; + // TextContentNormalizationProcessor flattens them downstream. + EXPECT_TRUE(apiHandler->getChatHistory()[1]["content"].is_array()); + EXPECT_EQ(apiHandler->getChatHistory()[1]["content"][0]["text"].as_string().value_or(""), "No idea my friend."); + EXPECT_TRUE(apiHandler->getChatHistory()[3]["content"].is_array()); + EXPECT_EQ(apiHandler->getChatHistory()[3]["content"][0]["text"].as_string().value_or(""), "Same thing. I'm not very good with images."); + // processedJson is no longer consumed by production code; omit brittle format assertions. } TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesWithInvalidContentTypeFails) { @@ -3281,69 +2437,51 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesWithInvalidContentTypeFails) EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Unsupported content type")); } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesEmptyImageUrlFails) { +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesEmptyContentArrayFails) { std::string json = R"({ "model": "llama", "messages": [ { "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "" - } - } - ] + "content": [] } ] })"; doc.Parse(json.c_str()); ASSERT_FALSE(doc.HasParseError()); std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Loading images from local filesystem is disabled.")); + EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Invalid message structure - content array is empty")); } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesImageUrlNotBase64Fails) { +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesWithNoContentFieldAddsEmptyStringToChatHistory) { + // Assistant turns that carry only tool_calls legitimately omit the "content" field. + // parseMessages injects an empty-string content entry so chat templates always see + // the field. Verifies the JsonContainer proxy write-through: lastMessage["content"] = "" + // on the value returned by chatHistory.last() DOES persist in chatHistory. std::string json = R"({ "model": "llama", "messages": [ { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": "base64,NOTBASE64" - } - } - ] - } + "role": "assistant", + "tool_calls": [{"id": "c1", "type": "function", "function": {"name": "f", "arguments": "{}"}}] + }, + {"role": "tool", "tool_call_id": "c1", "content": "result"} ] })"; doc.Parse(json.c_str()); ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Invalid base64 string in request")); -} + std::shared_ptr apiHandler = + std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesEmptyContentArrayFails) { - std::string json = R"({ - "model": "llama", - "messages": [ - { - "role": "user", - "content": [] - } - ] - })"; - doc.Parse(json.c_str()); - ASSERT_FALSE(doc.HasParseError()); - std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); - EXPECT_EQ(apiHandler->parseMessages(), absl::InvalidArgumentError("Invalid message structure - content array is empty")); + auto& chatHistory = apiHandler->getChatHistory(); + ASSERT_EQ(chatHistory.size(), 2u); + // The assistant turn had no "content" key — parseMessages must have injected "". + EXPECT_TRUE(chatHistory[0].contains("content")); + EXPECT_EQ(chatHistory[0]["content"].as_string().value_or("MISSING"), ""); } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesMultipleTextItemsConcatenatesWithNewline) { +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesMultipleTextItemsPreservesContentArray) { std::string json = R"({ "model": "llama", "messages": [ @@ -3366,15 +2504,18 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesMultipleTextItemsConcatenate ASSERT_FALSE(doc.HasParseError()); std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); - // Non-Python path: chatHistory content is the concatenated string + // Content arrays are preserved; flattening is done by TextContentNormalizationProcessor downstream. auto& chatHistory = apiHandler->getChatHistory(); ASSERT_EQ(chatHistory.size(), 1); - EXPECT_EQ(chatHistory[0]["content"], "First part.\nSecond part."); - // Python Jinja path: processedJson carries the same flattened content for applyChatTemplate - EXPECT_EQ(apiHandler->getProcessedJson(), R"({"model":"llama","messages":[{"role":"user","content":"First part.\nSecond part."}]})"); + auto content = chatHistory[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "text"); + EXPECT_EQ(content[0]["text"].as_string().value_or(""), "First part."); + EXPECT_EQ(content[1]["type"].as_string().value_or(""), "text"); + EXPECT_EQ(content[1]["text"].as_string().value_or(""), "Second part."); } -TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesTextBeforeAndAfterImageConcatenatesAllText) { +TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesTextAndImageMixedContentPreservesArray) { std::string json = R"({ "model": "llama", "messages": [ @@ -3403,13 +2544,17 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesTextBeforeAndAfterImageConca ASSERT_FALSE(doc.HasParseError()); std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); ASSERT_EQ(apiHandler->parseMessages(), absl::OkStatus()); - // Non-Python path: chatHistory content is the concatenated string + // Content array with images is preserved as JsonContainer for ImageDecodingProcessor. auto& chatHistory = apiHandler->getChatHistory(); ASSERT_EQ(chatHistory.size(), 1); - EXPECT_EQ(chatHistory[0]["content"], "Before image.\nAfter image."); - EXPECT_EQ(apiHandler->getImageHistory().size(), 1); - // Python Jinja path: processedJson carries the same flattened content for applyChatTemplate - EXPECT_EQ(apiHandler->getProcessedJson(), R"({"model":"llama","messages":[{"role":"user","content":"Before image.\nAfter image."}]})"); + auto content = chatHistory[0]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "text"); + EXPECT_EQ(content[0]["text"].as_string().value_or(""), "Before image."); + EXPECT_EQ(content[1]["type"].as_string().value_or(""), "image_url"); + EXPECT_EQ(content[2]["type"].as_string().value_or(""), "text"); + EXPECT_EQ(content[2]["text"].as_string().value_or(""), "After image."); + // imageHistory is no longer populated at parse time; decoding deferred. } TEST_F(HttpOpenAIHandlerParsingTest, maxTokensValueDefaultToMaxTokensLimit) { @@ -3818,7 +2963,11 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageUrlObjectSucceeds std::shared_ptr apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer); EXPECT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); - EXPECT_EQ(apiHandler->getImageHistory().size(), 1); + // Image URL is preserved as JsonContainer in chatHistory for ImageDecodingProcessor. + auto& chatHistory = apiHandler->getChatHistory(); + ASSERT_EQ(chatHistory.size(), 1); + EXPECT_TRUE(chatHistory[0]["content"].is_array()); + EXPECT_EQ(chatHistory[0]["content"][1]["type"].as_string().value_or(""), "image_url"); } TEST_F(HttpOpenAIHandlerParsingTest, ParsingResponsesInputImageWithoutImageUrlFails) { @@ -3979,10 +3128,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided1_ChoiceNone) {"type": "function", "function": {"name": "get_weather2", "description": "Get current temperature for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "City and country e.g. Bogot\u00e1, Colombia"}}, "required": ["location"], "additionalProperties": false}, "strict": true}} )"; std::string toolsChoice = R"("none")"; - std::string expectedJson = std::string("{\"messages\":[{\"role\":\"user\",\"content\":\"What is the weather like in Paris today?\"},{\"role\":\"assistant\",\"reasoning_content\":null,\"content\":\"\",\"tool_calls\":[{\"id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"type\":\"function\",\"function\":{\"name\":\"get_weather2\",\"arguments\":\"{\\\"location\\\": \\\"Paris, France\\\"}\"}}]},{\"role\":\"tool\",\"tool_call_id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"content\":\"15 degrees Celsius\"}],\"model\":\"llama\"," - "\"tool_choice\":\"none\"}"); - - assertRequestWithTools(providedTools, toolsChoice, expectedJson); + assertRequestWithTools(providedTools, toolsChoice, std::vector{}); } TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_ParsesToolsJsonContainerOnDemand) { @@ -4181,14 +3327,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided3_ChoiceFirst {"type": "function", "function": {"name": "get_weather3", "description": "Get current temperature for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "City and country e.g. Bogot\u00e1, Colombia"}}, "required": ["location"], "additionalProperties": false}, "strict": true}} )"; std::string toolsChoice = R"({"type": "function", "function": {"name": "get_weather1"}})"; - std::string expectedJson = std::string("{\"messages\":[" - "{\"role\":\"user\",\"content\":\"What is the weather like in Paris today?\"}," - "{\"role\":\"assistant\",\"reasoning_content\":null,\"content\":\"\",\"tool_calls\":[{\"id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"type\":\"function\",\"function\":{\"name\":\"get_weather2\",\"arguments\":\"{\\\"location\\\": \\\"Paris, France\\\"}\"}}]}," - "{\"role\":\"tool\",\"tool_call_id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"content\":\"15 degrees Celsius\"}],\"model\":\"llama\"," - "\"tools\":[{\"type\":\"function\",\"function\":{\"name\":\"get_weather1\",\"description\":\"Get current temperature for a given location.\",\"parameters\":{\"type\":\"object\",\"properties\":{\"location\":{\"type\":\"string\",\"description\":\"City and country e.g. Bogot\xC3\xA1, Colombia\"}},\"required\":[\"location\"],\"additionalProperties\":false},\"strict\":true}}]," - "\"tool_choice\":{\"type\":\"function\",\"function\":{\"name\":\"get_weather1\"}}}"); - - assertRequestWithTools(providedTools, toolsChoice, expectedJson); + assertRequestWithTools(providedTools, toolsChoice, std::vector{"get_weather1"}); } // Provide get_weather1, get_weather2, get_weather3 but take only second one - get_weather2 @@ -4199,14 +3338,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided3_ChoiceMiddl {"type": "function", "function": {"name": "get_weather3", "description": "Get current temperature for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "City and country e.g. Bogot\u00e1, Colombia"}}, "required": ["location"], "additionalProperties": false}, "strict": true}} )"; std::string toolsChoice = R"({"type": "function", "function": {"name": "get_weather2"}})"; - std::string expectedJson = std::string("{\"messages\":[" - "{\"role\":\"user\",\"content\":\"What is the weather like in Paris today?\"}," - "{\"role\":\"assistant\",\"reasoning_content\":null,\"content\":\"\",\"tool_calls\":[{\"id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"type\":\"function\",\"function\":{\"name\":\"get_weather2\",\"arguments\":\"{\\\"location\\\": \\\"Paris, France\\\"}\"}}]}," - "{\"role\":\"tool\",\"tool_call_id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"content\":\"15 degrees Celsius\"}],\"model\":\"llama\"," - "\"tools\":[{\"type\":\"function\",\"function\":{\"name\":\"get_weather2\",\"description\":\"Get current temperature for a given location.\",\"parameters\":{\"type\":\"object\",\"properties\":{\"location\":{\"type\":\"string\",\"description\":\"City and country e.g. Bogot\xC3\xA1, Colombia\"}},\"required\":[\"location\"],\"additionalProperties\":false},\"strict\":true}}]," - "\"tool_choice\":{\"type\":\"function\",\"function\":{\"name\":\"get_weather2\"}}}"); - - assertRequestWithTools(providedTools, toolsChoice, expectedJson); + assertRequestWithTools(providedTools, toolsChoice, std::vector{"get_weather2"}); } // Provide get_weather1, get_weather2, get_weather3 but take only second one - get_weather2 @@ -4217,14 +3349,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided3_ChoiceLast) {"type": "function", "function": {"name": "get_weather3", "description": "Get current temperature for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "City and country e.g. Bogot\u00e1, Colombia"}}, "required": ["location"], "additionalProperties": false}, "strict": true}} )"; std::string toolsChoice = R"({"type": "function", "function": {"name": "get_weather3"}})"; - std::string expectedJson = std::string("{\"messages\":[" - "{\"role\":\"user\",\"content\":\"What is the weather like in Paris today?\"}," - "{\"role\":\"assistant\",\"reasoning_content\":null,\"content\":\"\",\"tool_calls\":[{\"id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"type\":\"function\",\"function\":{\"name\":\"get_weather2\",\"arguments\":\"{\\\"location\\\": \\\"Paris, France\\\"}\"}}]}," - "{\"role\":\"tool\",\"tool_call_id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"content\":\"15 degrees Celsius\"}],\"model\":\"llama\"," - "\"tools\":[{\"type\":\"function\",\"function\":{\"name\":\"get_weather3\",\"description\":\"Get current temperature for a given location.\",\"parameters\":{\"type\":\"object\",\"properties\":{\"location\":{\"type\":\"string\",\"description\":\"City and country e.g. Bogot\xC3\xA1, Colombia\"}},\"required\":[\"location\"],\"additionalProperties\":false},\"strict\":true}}]," - "\"tool_choice\":{\"type\":\"function\",\"function\":{\"name\":\"get_weather3\"}}}"); - - assertRequestWithTools(providedTools, toolsChoice, expectedJson); + assertRequestWithTools(providedTools, toolsChoice, std::vector{"get_weather3"}); } // Provide get_weather1, get_weather2, get_weather3 but take one - get_weather4 which does not exist @@ -4236,14 +3361,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided3_ChoiceNotIn {"type": "function", "function": {"name": "get_weather3", "description": "Get current temperature for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "City and country e.g. Bogot\u00e1, Colombia"}}, "required": ["location"], "additionalProperties": false}, "strict": true}} )"; std::string toolsChoice = R"({"type": "function", "function": {"name": "get_weather4"}})"; - std::string expectedJson = std::string("{\"messages\":[" - "{\"role\":\"user\",\"content\":\"What is the weather like in Paris today?\"}," - "{\"role\":\"assistant\",\"reasoning_content\":null,\"content\":\"\",\"tool_calls\":[{\"id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"type\":\"function\",\"function\":{\"name\":\"get_weather2\",\"arguments\":\"{\\\"location\\\": \\\"Paris, France\\\"}\"}}]}," - "{\"role\":\"tool\",\"tool_call_id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"content\":\"15 degrees Celsius\"}],\"model\":\"llama\"," - "\"tools\":[]," - "\"tool_choice\":{\"type\":\"function\",\"function\":{\"name\":\"get_weather4\"}}}"); - - assertRequestWithTools(providedTools, toolsChoice, expectedJson); + assertRequestWithTools(providedTools, toolsChoice, std::vector{}); } // Provide get_weather1, get_weather2, get_weather3 but tool_choice is not of type function @@ -4255,14 +3373,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided3_ChoiceIsNot {"type": "function", "function": {"name": "get_weather3", "description": "Get current temperature for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "City and country e.g. Bogot\u00e1, Colombia"}}, "required": ["location"], "additionalProperties": false}, "strict": true}} )"; std::string toolsChoice = R"({"type": "INVALID_TYPE", "function": {"name": "get_weather3"}})"; - std::string expectedJson = std::string("{\"messages\":[" - "{\"role\":\"user\",\"content\":\"What is the weather like in Paris today?\"}," - "{\"role\":\"assistant\",\"reasoning_content\":null,\"content\":\"\",\"tool_calls\":[{\"id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"type\":\"function\",\"function\":{\"name\":\"get_weather2\",\"arguments\":\"{\\\"location\\\": \\\"Paris, France\\\"}\"}}]}," - "{\"role\":\"tool\",\"tool_call_id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"content\":\"15 degrees Celsius\"}],\"model\":\"llama\"," - "\"tools\":[{\"type\":\"function\",\"function\":{\"name\":\"get_weather3\",\"description\":\"Get current temperature for a given location.\",\"parameters\":{\"type\":\"object\",\"properties\":{\"location\":{\"type\":\"string\",\"description\":\"City and country e.g. Bogot\xC3\xA1, Colombia\"}},\"required\":[\"location\"],\"additionalProperties\":false},\"strict\":true}}]," - "\"tool_choice\":{\"type\":\"INVALID_TYPE\",\"function\":{\"name\":\"get_weather3\"}}}"); - - assertRequestWithTools(providedTools, toolsChoice, expectedJson); + assertRequestWithTools(providedTools, toolsChoice, std::vector{"get_weather3"}); } // Provide get_weather1, get_weather2, get_weather3 but tool_choice is not an object, string but a number @@ -4285,12 +3396,6 @@ TEST_F(HttpOpenAIHandlerParsingTest, ParseRequestWithTools_Provided3_ChoiceIsASt {"type": "function", "function": {"name": "get_weather3", "description": "Get current temperature for a given location.", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "City and country e.g. Bogot\u00e1, Colombia"}}, "required": ["location"], "additionalProperties": false}, "strict": true}} )"; std::string toolsChoice = "\"get_weather1\""; - std::string expectedJson = std::string("{\"messages\":[" - "{\"role\":\"user\",\"content\":\"What is the weather like in Paris today?\"}," - "{\"role\":\"assistant\",\"reasoning_content\":null,\"content\":\"\",\"tool_calls\":[{\"id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"type\":\"function\",\"function\":{\"name\":\"get_weather2\",\"arguments\":\"{\\\"location\\\": \\\"Paris, France\\\"}\"}}]}," - "{\"role\":\"tool\",\"tool_call_id\":\"chatcmpl-tool-d39b13c90f9b4d48b08c16455553dbec\",\"content\":\"15 degrees Celsius\"}],\"model\":\"llama\"," - "\"tools\":[{\"type\":\"function\",\"function\":{\"name\":\"get_weather1\",\"description\":\"Get current temperature for a given location.\",\"parameters\":{\"type\":\"object\",\"properties\":{\"location\":{\"type\":\"string\",\"description\":\"City and country e.g. Bogot\xC3\xA1, Colombia\"}},\"required\":[\"location\"],\"additionalProperties\":false},\"strict\":true}}]," - "\"tool_choice\":{\"type\":\"function\",\"function\":{\"name\":\"get_weather1\"}}}"); assertRequestWithTools(providedTools, toolsChoice, absl::StatusCode::kInvalidArgument); } @@ -4988,8 +4093,7 @@ absl::Status tryParseResponses(rapidjson::Document& doc, ov::genai::Tokenizer& t } // Assert that parsing the given Responses API request produces a chat history -// (and processedJson, when Python is enabled) equivalent to the expected -// chat/completions request. +// equivalent to the expected chat/completions request. // // The expected JSON is a chat/completions REQUEST body — an object with a // "messages" array and optionally a "tools" array. This makes each test read as @@ -5000,9 +4104,7 @@ absl::Status tryParseResponses(rapidjson::Document& doc, ov::genai::Tokenizer& t // Comparison is structural via rapidjson Value::operator== (member order inside // objects is irrelevant). // -// Both the chat-history path (used in the C++/non-Python build) and the -// processedJson path (used by the Python Jinja template) are checked, so a -// single test pins both downstream consumers. +// The chat-history path (used by the C++ ChatTemplateProcessor) is checked. void expectResponsesEquivalentToChatCompletions(rapidjson::Document& doc, ov::genai::Tokenizer& tokenizer, const std::string& responsesRequest, const std::string& expectedChatCompletions) { auto handler = parseResponses(doc, tokenizer, responsesRequest); @@ -5038,24 +4140,6 @@ void expectResponsesEquivalentToChatCompletions(rapidjson::Document& doc, ov::ge << "parseToolsToJsonContainer mismatch.\n actual: " << actualToolsJson << "\n expected: " << expectedChatCompletions; } - -#if (PYTHON_DISABLE == 0) - // --- processedJson path (Python Jinja chat template) --- - const std::string actualProcessedJson = handler->getProcessedJson(); - rapidjson::Document actualProcessedDoc; - actualProcessedDoc.Parse(actualProcessedJson.c_str()); - ASSERT_FALSE(actualProcessedDoc.HasParseError()) << actualProcessedJson; - ASSERT_TRUE(actualProcessedDoc.HasMember("messages")) << actualProcessedJson; - EXPECT_TRUE(actualProcessedDoc["messages"] == expectedDoc["messages"]) - << "processedJson messages mismatch.\n actual: " << actualProcessedJson - << "\n expected: " << expectedChatCompletions; - if (expectedDoc.HasMember("tools")) { - ASSERT_TRUE(actualProcessedDoc.HasMember("tools")) << actualProcessedJson; - EXPECT_TRUE(actualProcessedDoc["tools"] == expectedDoc["tools"]) - << "processedJson tools mismatch.\n actual: " << actualProcessedJson - << "\n expected: " << expectedChatCompletions; - } -#endif } } // namespace @@ -5063,7 +4147,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFlatToolsNormaliseToChatCompletion // Responses-flat tools shape ({type, name, parameters}) must be rewritten // to chat/completions nested shape ({type, function:{...}}) before the // request is forwarded to the chat template. Input is given as an array so - // both ChatHistory and processedJson sinks populate the messages array. + // ChatHistorySink populates the messages array. expectResponsesEquivalentToChatCompletions(doc, *tokenizer, R"({ "model": "llama", @@ -5076,7 +4160,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFlatToolsNormaliseToChatCompletion }] })", R"({ - "messages": [{"role":"user","content":"hello"}], + "messages": [{"role":"user","content":[{"type":"text","text":"hello"}]}], "tools": [{ "type":"function", "function":{ @@ -5127,8 +4211,8 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesReasoningBufferedOntoNextAssistant })", R"({ "messages": [ - {"role":"user","content":"hi"}, - {"role":"assistant","content":"hello","reasoning_content":"think first"} + {"role":"user","content":[{"type":"text","text":"hi"}]}, + {"role":"assistant","content":[{"type":"text","text":"hello"}],"reasoning_content":"think first"} ] })"); } @@ -5149,9 +4233,9 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesStandaloneReasoningWithoutAssistan })", R"({ "messages": [ - {"role":"user","content":"hi"}, + {"role":"user","content":[{"type":"text","text":"hi"}]}, {"role":"assistant","content":"","reasoning_content":"orphan"}, - {"role":"user","content":"again"} + {"role":"user","content":[{"type":"text","text":"again"}]} ] })"); } @@ -5169,7 +4253,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesTrailingStandaloneReasoningIsEmitt })", R"({ "messages": [ - {"role":"user","content":"hi"}, + {"role":"user","content":[{"type":"text","text":"hi"}]}, {"role":"assistant","content":"","reasoning_content":"trailing"} ] })"); @@ -5194,7 +4278,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFunctionCallMergedIntoAssistantToo })", R"({ "messages": [ - {"role":"user","content":"weather?"}, + {"role":"user","content":[{"type":"text","text":"weather?"}]}, {"role":"assistant","content":"","tool_calls":[ {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"Paris\"}"}} ]}, @@ -5219,7 +4303,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesReasoningPlusFunctionCallRidesOnAs })", R"({ "messages": [ - {"role":"user","content":"weather?"}, + {"role":"user","content":[{"type":"text","text":"weather?"}]}, {"role":"assistant","content":"","reasoning_content":"need to call get_weather","tool_calls":[ {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"Paris\"}"}} ]}, @@ -5245,7 +4329,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesMultipleFunctionCallsMergedInOneAs })", R"({ "messages": [ - {"role":"user","content":"weather?"}, + {"role":"user","content":[{"type":"text","text":"weather?"}]}, {"role":"assistant","content":"","tool_calls":[ {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"Paris\"}"}}, {"id":"call_2","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"London\"}"}} @@ -5270,7 +4354,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesTrailingFunctionCallFlushedAsAssis })", R"({ "messages": [ - {"role":"user","content":"weather?"}, + {"role":"user","content":[{"type":"text","text":"weather?"}]}, {"role":"assistant","content":"","tool_calls":[ {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"Paris\"}"}} ]} @@ -5294,7 +4378,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesAssistantMessageAbsorbsBufferedFun })", R"({ "messages": [ - {"role":"user","content":"weather?"}, + {"role":"user","content":[{"type":"text","text":"weather?"}]}, {"role":"assistant","content":"calling tool","tool_calls":[ {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"Paris\"}"}} ]} @@ -5316,7 +4400,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesReasoningContentArrayShapeAccepted })", R"({ "messages": [ - {"role":"user","content":"hi"}, + {"role":"user","content":[{"type":"text","text":"hi"}]}, {"role":"assistant","content":"ok","reasoning_content":"new shape"} ] })"); @@ -5337,7 +4421,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFunctionCallOutputWithoutCallIdAcc })", R"({ "messages": [ - {"role":"user","content":"weather?"}, + {"role":"user","content":[{"type":"text","text":"weather?"}]}, {"role":"assistant","content":"","tool_calls":[ {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{}"}} ]}, @@ -5365,7 +4449,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFunctionCallPrefersCallIdOverId) { })", R"({ "messages": [ - {"role":"user","content":"weather?"}, + {"role":"user","content":[{"type":"text","text":"weather?"}]}, {"role":"assistant","content":"","tool_calls":[ {"id":"call_xyz","type":"function","function":{"name":"get_weather","arguments":"{}"}} ]}, @@ -5388,7 +4472,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFunctionCallOnlyCallIdSupplied) { })", R"({ "messages": [ - {"role":"user","content":"weather?"}, + {"role":"user","content":[{"type":"text","text":"weather?"}]}, {"role":"assistant","content":"","tool_calls":[ {"id":"call_xyz","type":"function","function":{"name":"get_weather","arguments":"{}"}} ]}, @@ -5433,14 +4517,11 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFunctionCallMissingArgumentsReject EXPECT_EQ(status, absl::InvalidArgumentError("function_call item is missing required arguments field")); } -TEST_F(HttpOpenAIHandlerParsingTest, ResponsesImageHistoryIndexMatchesChatHistoryTurn) { - // Regression test for the image-index drift bug: when an input item is - // merged (function_call buffered, then absorbed into the next assistant - // message), the Responses input-array index no longer matches the - // resulting chatHistory index. ChatHistorySink::appendInputImage must - // record the actual chatHistory turn index so the VLM servable can - // prepend the tag to the correct message (and not - // index out-of-bounds). +TEST_F(HttpOpenAIHandlerParsingTest, ResponsesImageChatHistoryIndexMatchesAfterFunctionCallMerge) { + // Regression test: when a function_call item is buffered and merged into the next + // assistant message, the image-bearing user item must land at chatHistory[2] (not + // the input-array index 3, which would be out-of-bounds). After Phase 6, verify + // the image URL is preserved in the correct chatHistory slot. const std::string base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="; std::string json = R"({ "model": "llama", @@ -5466,13 +4547,12 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesImageHistoryIndexMatchesChatHistor auto& chatHistory = apiHandler->getChatHistory(); ASSERT_EQ(chatHistory.size(), 3u); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 1u); - auto [turnIndex, image] = imageHistory[0]; - // Must point at the second user (chatHistory[2]) — NOT the input-array - // index 3, which would be out-of-bounds for chatHistory[3]. - EXPECT_EQ(turnIndex, 2u); - EXPECT_LT(turnIndex, chatHistory.size()); + // After Phase 6: imageHistory is empty; image URL is preserved in chatHistory[2]["content"]. + // Verify the image landed at the correct chatHistory slot (not out-of-bounds). + auto content = chatHistory[2]["content"]; + EXPECT_TRUE(content.is_array()); + EXPECT_EQ(content[0]["type"].as_string().value_or(""), "text"); + EXPECT_EQ(content[1]["type"].as_string().value_or(""), "image_url"); } // --- Tools normalisation edge cases --- @@ -5481,7 +4561,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFlatToolWithoutParametersIsNormali // Flat Responses tools may omit `parameters` for zero-arg functions. The // nested form should still be produced (with no `parameters` key under // function), not fail or fabricate one. Input is given as an array so - // both ChatHistory and processedJson sinks populate the messages array. + // ChatHistorySink populates the messages array. expectResponsesEquivalentToChatCompletions(doc, *tokenizer, R"({ "model": "llama", @@ -5489,7 +4569,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFlatToolWithoutParametersIsNormali "tools": [{"type": "function", "name": "ping", "description": "no args"}] })", R"({ - "messages": [{"role":"user","content":"hello"}], + "messages": [{"role":"user","content":[{"type":"text","text":"hello"}]}], "tools": [{"type":"function","function":{"name":"ping","description":"no args"}}] })"); } @@ -5579,12 +4659,12 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesMultiTurnReasoningFunctionCallAndF })", R"({ "messages": [ - {"role":"user","content":"weather in Paris?"}, + {"role":"user","content":[{"type":"text","text":"weather in Paris?"}]}, {"role":"assistant","content":"","reasoning_content":"need to call get_weather","tool_calls":[ {"id":"call_1","type":"function","function":{"name":"get_weather","arguments":"{\"city\":\"Paris\"}"}} ]}, {"role":"tool","tool_call_id":"call_1","content":"sunny, 22C"}, - {"role":"assistant","content":"It is sunny and 22C in Paris.","reasoning_content":"format the answer"} + {"role":"assistant","content":[{"type":"text","text":"It is sunny and 22C in Paris."}],"reasoning_content":"format the answer"} ] })"); } @@ -5597,7 +4677,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesMultiTurnReasoningFunctionCallAndF // The 4th request OVMS sees while running BFCL multi_turn_base_0 reports 128 // MORE input_tokens on /responses than the equivalent /chat/completions call, // even though the message lists are structurally equivalent. This test -// reproduces the exact shape so processedJson can be compared head-to-head. +// reproduces the exact shape so chatHistory can be compared head-to-head. TEST_F(HttpOpenAIHandlerParsingTest, ResponsesBfclReplayShapeWithEchoedAssistantMessages) { expectResponsesEquivalentToChatCompletions(doc, *tokenizer, R"({ @@ -5623,11 +4703,11 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesBfclReplayShapeWithEchoedAssistant R"({ "messages": [ {"role":"user","content":"do work"}, - {"role":"assistant","content":"","tool_calls":[ + {"role":"assistant","content":[{"type":"text","text":""}],"tool_calls":[ {"id":"fc1","type":"function","function":{"name":"mkdir","arguments":"{\"dir_name\":\"temp\"}"}} ]}, {"role":"tool","tool_call_id":"fc1","content":"None"}, - {"role":"assistant","content":"","tool_calls":[ + {"role":"assistant","content":[{"type":"text","text":""}],"tool_calls":[ {"id":"fc2","type":"function","function":{"name":"mv","arguments":"{\"source\":\"a\",\"destination\":\"temp\"}"}} ]}, {"role":"tool","tool_call_id":"fc2","content":"{\"error\":\"no\"}"} @@ -5689,16 +4769,23 @@ class HttpOpenAIHandlerResponsesImageUrlShapeTest : public HttpOpenAIHandlerPars }; TEST_P(HttpOpenAIHandlerResponsesImageUrlShapeTest, ValidBase64ImageSucceeds) { + // Image decoding is deferred to ImageDecodingProcessor; imageHistory is no longer populated. const std::string base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="; std::string content = "[" + formatInputImageItem(base64Image) + "]"; std::string json = createResponsesRequest({content}); auto apiHandler = parseResponses(json); ASSERT_NE(apiHandler, nullptr); - ASSERT_EQ(apiHandler->getImageHistory().size(), 1); - EXPECT_EQ(apiHandler->getImageHistory()[0].first, 0u); + // The image_url is preserved in chatHistory for ImageDecodingProcessor. + const auto& chatHistory = apiHandler->getChatHistory(); + ASSERT_EQ(chatHistory.size(), 1u); + auto msgContent = chatHistory[0]["content"]; + EXPECT_TRUE(msgContent.is_array()); + EXPECT_EQ(msgContent[0]["type"].as_string().value_or(""), "image_url"); } -TEST_P(HttpOpenAIHandlerResponsesImageUrlShapeTest, MultipleImagesAcrossTurnsHaveCorrectIndices) { +TEST_P(HttpOpenAIHandlerResponsesImageUrlShapeTest, MultipleImagesAcrossTurnsHaveCorrectChatHistoryContent) { + // Image decoding is deferred; imageHistory is no longer populated. + // Verify instead that both image-bearing turns have array content with image_url entries. const std::string base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="; // Turn 0 (user): image + text // Turn 1 (assistant) - skipped by the helper; we'll inline-build @@ -5710,10 +4797,14 @@ TEST_P(HttpOpenAIHandlerResponsesImageUrlShapeTest, MultipleImagesAcrossTurnsHav R"(]})"; auto apiHandler = parseResponses(json); ASSERT_NE(apiHandler, nullptr); - const ovms::ImageHistory& imageHistory = apiHandler->getImageHistory(); - ASSERT_EQ(imageHistory.size(), 2); - EXPECT_EQ(imageHistory[0].first, 0u); - EXPECT_EQ(imageHistory[1].first, 2u); + const auto& chatHistory = apiHandler->getChatHistory(); + ASSERT_EQ(chatHistory.size(), 3u); // user, assistant, user + auto content0 = chatHistory[0]["content"]; + EXPECT_TRUE(content0.is_array()); + EXPECT_EQ(content0[0]["type"].as_string().value_or(""), "image_url"); + auto content2 = chatHistory[2]["content"]; + EXPECT_TRUE(content2.is_array()); + EXPECT_EQ(content2[0]["type"].as_string().value_or(""), "image_url"); } INSTANTIATE_TEST_SUITE_P( diff --git a/src/test/llm/input_processing/image_decoding_processor_test.cpp b/src/test/llm/input_processing/image_decoding_processor_test.cpp new file mode 100644 index 0000000000..9d1178fc2f --- /dev/null +++ b/src/test/llm/input_processing/image_decoding_processor_test.cpp @@ -0,0 +1,203 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#include + +#include +#include + +#include "../../../llm/io_processing/input_processors/image_decoding_processor.hpp" +#include "../../../llm/io_processing/input_request.hpp" + +using namespace ovms; + +// Helpers ---------------------------------------------------------------- + +static InputRequest makeChatRequest(ov::genai::ChatHistory chatHistory) { + InputRequest req; + req.input = std::move(chatHistory); + return req; +} + +// Tests ------------------------------------------------------------------ + +TEST(ImageDecodingProcessorTest, NoImagesInTextOnlyMessage) { + ov::genai::ChatHistory history; + history.push_back({{"role", "user"}, {"content", "Hello, world!"}}); + + InputRequest req = makeChatRequest(history); + ImageDecodingProcessor processor(std::nullopt, std::nullopt); + const auto status = processor.process(req); + + EXPECT_TRUE(status.ok()); + EXPECT_TRUE(req.inputImages.empty()); + // Content unchanged + const auto& resultHistory = std::get(req.input); + EXPECT_EQ(resultHistory[0]["content"].as_string().value_or(""), "Hello, world!"); +} + +TEST(ImageDecodingProcessorTest, InjectionGuardBlocksPreexistingTag) { + ov::genai::ChatHistory history; + history.push_back({{"role", "user"}, {"content", "\nsome text"}}); + + InputRequest req = makeChatRequest(history); + ImageDecodingProcessor processor(std::nullopt, std::nullopt); + const auto status = processor.process(req); + + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); +} + +TEST(ImageDecodingProcessorTest, InjectionGuardBlocksTagInMiddleOfContent) { + ov::genai::ChatHistory history; + history.push_back({{"role", "user"}, {"content", "prefix suffix"}}); + + InputRequest req = makeChatRequest(history); + ImageDecodingProcessor processor(std::nullopt, std::nullopt); + const auto status = processor.process(req); + + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); +} + +TEST(ImageDecodingProcessorTest, SkipsMessagesWithNonArrayContent) { + ov::genai::ChatHistory history; + history.push_back({{"role", "system"}, {"content", "You are helpful."}}); + history.push_back({{"role", "user"}, {"content", "What is OpenVINO?"}}); + + InputRequest req = makeChatRequest(history); + ImageDecodingProcessor processor(std::nullopt, std::nullopt); + const auto status = processor.process(req); + + EXPECT_TRUE(status.ok()); + EXPECT_TRUE(req.inputImages.empty()); +} + +TEST(ImageDecodingProcessorTest, InjectionGuardBlocksTagInArrayTextPart) { + // A multimodal message where a text part embeds the restricted tag. + // Without the array-aware guard this would bypass the check. + ov::genai::ChatHistory history; + ov::AnyMap msg; + msg["role"] = std::string("user"); + ov::genai::JsonContainer contentArray = ov::genai::JsonContainer::from_json_string( + R"([{"type":"text","text":"look at this tag"}])"); + msg["content"] = contentArray; + history.push_back(msg); + + InputRequest req = makeChatRequest(history); + ImageDecodingProcessor processor(std::nullopt, std::nullopt); + const auto status = processor.process(req); + + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); +} + +TEST(ImageDecodingProcessorTest, MultipleTextPartsJoinedWithNewline) { + // Two text parts in a single message's content array should be joined with \n. + ov::genai::ChatHistory history; + ov::AnyMap msg; + msg["role"] = std::string("user"); + ov::genai::JsonContainer contentArray = ov::genai::JsonContainer::from_json_string( + R"([{"type":"text","text":"Before image."},{"type":"text","text":"After image."}])"); + msg["content"] = contentArray; + history.push_back(msg); + + InputRequest req = makeChatRequest(history); + ImageDecodingProcessor processor(std::nullopt, std::nullopt); + const auto status = processor.process(req); + + ASSERT_TRUE(status.ok()); + const auto& resultHistory = std::get(req.input); + EXPECT_EQ(resultHistory[0]["content"].as_string().value_or(""), "Before image.\nAfter image."); +} + +// --- URL / path validation tests ----------------------------------------- + +// Helper: build a chat request whose single message has a content array with +// one image_url part pointing at the given URL. +static InputRequest makeImageUrlRequest(const std::string& url) { + std::string contentJson = + R"([{"type":"image_url","image_url":{"url":")" + url + R"("}}])"; + ov::genai::ChatHistory history; + ov::AnyMap msg; + msg["role"] = std::string("user"); + msg["content"] = ov::genai::JsonContainer::from_json_string(contentJson); + history.push_back(msg); + return makeChatRequest(history); +} + +TEST(ImageDecodingProcessorTest, Base64InvalidDataRejected) { + InputRequest req = makeImageUrlRequest("data:image/jpeg;base64,NOT_VALID_BASE64!!!"); + ImageDecodingProcessor processor(std::nullopt, std::nullopt); + const auto status = processor.process(req); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); + EXPECT_EQ(status.message(), "Invalid base64 string in request"); +} + +TEST(ImageDecodingProcessorTest, HttpUrlDomainNotInAllowList) { + InputRequest req = makeImageUrlRequest("http://evil.com/image.jpg"); + ImageDecodingProcessor processor(std::nullopt, std::vector{"safe.com"}); + const auto status = processor.process(req); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); + EXPECT_EQ(status.message(), "Given url does not match any allowed domain from allowed_media_domains"); +} + +TEST(ImageDecodingProcessorTest, HttpUrlWithNoAllowedDomainsConfiguredRejected) { + InputRequest req = makeImageUrlRequest("http://any.com/image.jpg"); + // No allowed domains configured — all HTTP URLs must be rejected. + ImageDecodingProcessor processor(std::nullopt, std::nullopt); + const auto status = processor.process(req); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); + EXPECT_EQ(status.message(), "Given url does not match any allowed domain from allowed_media_domains"); +} + +TEST(ImageDecodingProcessorTest, LocalFilesystemDisabledRejected) { + InputRequest req = makeImageUrlRequest("/some/image.png"); + // No allowedLocalMediaPath configured — local filesystem access is disabled. + ImageDecodingProcessor processor(std::nullopt, std::nullopt); + const auto status = processor.process(req); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); + EXPECT_EQ(status.message(), "Loading images from local filesystem is disabled."); +} + +TEST(ImageDecodingProcessorTest, LocalPathTraversalWithDotDotRejected) { + InputRequest req = makeImageUrlRequest("../escape/image.png"); + ImageDecodingProcessor processor(std::string("/allowed/path"), std::nullopt); + const auto status = processor.process(req); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); +} + +TEST(ImageDecodingProcessorTest, LocalPathOutsideAllowedDirectoryRejected) { + InputRequest req = makeImageUrlRequest("/outside/image.png"); + ImageDecodingProcessor processor(std::string("/allowed/path"), std::nullopt); + const auto status = processor.process(req); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); + EXPECT_EQ(status.message(), "Given filepath is not subpath of allowed_local_media_path"); +} + +TEST(ImageDecodingProcessorTest, LocalPathInsideAllowedDirectoryButFileNotFound) { + InputRequest req = makeImageUrlRequest("/allowed/path/nonexistent.png"); + ImageDecodingProcessor processor(std::string("/allowed/path"), std::nullopt); + const auto status = processor.process(req); + EXPECT_FALSE(status.ok()); + EXPECT_EQ(status.code(), absl::StatusCode::kInvalidArgument); + EXPECT_EQ(status.message(), "Image file parsing failed"); +} diff --git a/src/test/llm/input_processing/raw_prompt_extractor_test.cpp b/src/test/llm/input_processing/raw_prompt_extractor_test.cpp new file mode 100644 index 0000000000..942bdcc689 --- /dev/null +++ b/src/test/llm/input_processing/raw_prompt_extractor_test.cpp @@ -0,0 +1,51 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#include + +#include +#include + +#include "../../../llm/io_processing/input_processors/raw_prompt_extractor.hpp" +#include "../../../llm/io_processing/input_request.hpp" + +using namespace ovms; + +TEST(RawPromptExtractorTest, MovesPromptToPromptText) { + InputRequest req; + req.input = std::string("Explain quantum computing."); + + RawPromptExtractor extractor; + const auto status = extractor.process(req); + + EXPECT_TRUE(status.ok()); + EXPECT_EQ(req.promptText, "Explain quantum computing."); +} + +TEST(RawPromptExtractorTest, EmptyStringProducesEmptyPromptText) { + InputRequest req; + req.input = std::string(""); + + RawPromptExtractor extractor; + const auto status = extractor.process(req); + + EXPECT_TRUE(status.ok()); + EXPECT_TRUE(req.promptText.empty()); +} + +// Note: tests for ChatTemplateProcessor and TokenizationProcessor that require +// an actual tokenizer are covered by integration tests in +// src/test/llm/input_processing/chat_template_processor_test.cpp and +// src/test/llm/input_processing/tokenization_processor_test.cpp. diff --git a/src/test/llm/input_processing/text_content_normalization_processor_test.cpp b/src/test/llm/input_processing/text_content_normalization_processor_test.cpp new file mode 100644 index 0000000000..a1b814ab7a --- /dev/null +++ b/src/test/llm/input_processing/text_content_normalization_processor_test.cpp @@ -0,0 +1,99 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#include + +#include +#include + +#include "../../../llm/io_processing/input_processors/text_content_normalization_processor.hpp" +#include "../../../llm/io_processing/input_request.hpp" + +using namespace ovms; + +// Helpers ---------------------------------------------------------------- + +static InputRequest makeChatRequest(ov::genai::ChatHistory chatHistory) { + InputRequest req; + req.input = std::move(chatHistory); + return req; +} + +// Tests ------------------------------------------------------------------ + +TEST(TextContentNormalizationProcessorTest, StringContentPassedThrough) { + ov::genai::ChatHistory history; + history.push_back({{"role", "user"}, {"content", "Hello, world!"}}); + + InputRequest req = makeChatRequest(history); + TextContentNormalizationProcessor processor; + const auto status = processor.process(req); + + EXPECT_TRUE(status.ok()); + const auto& result = std::get(req.input); + EXPECT_EQ(result[0]["content"].as_string().value_or(""), "Hello, world!"); +} + +TEST(TextContentNormalizationProcessorTest, SingleTextPartFlattened) { + ov::genai::ChatHistory history; + ov::AnyMap msg = {{"role", std::string("user")}}; + ov::genai::JsonContainer parts = ov::genai::JsonContainer::from_json_string( + R"([{"type":"text","text":"hello"}])"); + msg["content"] = parts; + history.push_back(msg); + + InputRequest req = makeChatRequest(history); + TextContentNormalizationProcessor processor; + const auto status = processor.process(req); + + EXPECT_TRUE(status.ok()); + const auto& result = std::get(req.input); + EXPECT_EQ(result[0]["content"].as_string().value_or(""), "hello"); +} + +TEST(TextContentNormalizationProcessorTest, MultipleTextPartsJoinedWithNewline) { + ov::genai::ChatHistory history; + ov::AnyMap msg = {{"role", std::string("user")}}; + ov::genai::JsonContainer parts = ov::genai::JsonContainer::from_json_string( + R"([{"type":"text","text":"first"},{"type":"text","text":"second"}])"); + msg["content"] = parts; + history.push_back(msg); + + InputRequest req = makeChatRequest(history); + TextContentNormalizationProcessor processor; + const auto status = processor.process(req); + + EXPECT_TRUE(status.ok()); + const auto& result = std::get(req.input); + EXPECT_EQ(result[0]["content"].as_string().value_or(""), "first\nsecond"); +} + +TEST(TextContentNormalizationProcessorTest, NonTextPartsIgnored) { + // image_url entries alongside text: only text parts should contribute to combined string. + ov::genai::ChatHistory history; + ov::AnyMap msg = {{"role", std::string("user")}}; + ov::genai::JsonContainer parts = ov::genai::JsonContainer::from_json_string( + R"([{"type":"image_url","image_url":{"url":"http://example.com/img.png"}},{"type":"text","text":"describe this"}])"); + msg["content"] = parts; + history.push_back(msg); + + InputRequest req = makeChatRequest(history); + TextContentNormalizationProcessor processor; + const auto status = processor.process(req); + + EXPECT_TRUE(status.ok()); + const auto& result = std::get(req.input); + EXPECT_EQ(result[0]["content"].as_string().value_or(""), "describe this"); +} diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp index e13cf29919..d100c92845 100644 --- a/src/test/llm/llmnode_test.cpp +++ b/src/test/llm/llmnode_test.cpp @@ -4946,8 +4946,8 @@ TEST_F(IsolatedServableTests, PromtSizeExceedsDefaultMaxPromptLenNPU) { std::vector randomData(dataSize); std::fill(randomData.begin(), randomData.end(), 1.0f); ov::Tensor tensor(ov::element::f32, {1, dataSize}, randomData.data()); - executionContext.inputIds = tensor; - auto status = legacyServable.callValidateInputComplianceWithProperties(executionContext.inputIds); + executionContext.inputRequest.inputIds = tensor; + auto status = legacyServable.callValidateInputComplianceWithProperties(executionContext.inputRequest.inputIds); ASSERT_EQ(status, absl::InvalidArgumentError("Input length exceeds the maximum allowed length")); } @@ -4960,8 +4960,8 @@ TEST_F(IsolatedServableTests, PromtSizeExceedsNonDefaultMaxPromptLenNPU) { std::vector randomData(dataSize); std::fill(randomData.begin(), randomData.end(), 1.0f); ov::Tensor tensor(ov::element::f32, {1, dataSize}, randomData.data()); - executionContext.inputIds = tensor; - auto status = legacyServable.callValidateInputComplianceWithProperties(executionContext.inputIds); + executionContext.inputRequest.inputIds = tensor; + auto status = legacyServable.callValidateInputComplianceWithProperties(executionContext.inputRequest.inputIds); ASSERT_EQ(status, absl::InvalidArgumentError("Input length exceeds the maximum allowed length")); } @@ -4974,8 +4974,8 @@ TEST_F(IsolatedServableTests, PromtSizeBetweenDefaultAndNonDefaultMaxPromptLenNP std::vector randomData(dataSize); std::fill(randomData.begin(), randomData.end(), 1.0f); ov::Tensor tensor(ov::element::f32, {1, dataSize}, randomData.data()); - executionContext.inputIds = tensor; - auto status = legacyServable.callValidateInputComplianceWithProperties(executionContext.inputIds); + executionContext.inputRequest.inputIds = tensor; + auto status = legacyServable.callValidateInputComplianceWithProperties(executionContext.inputRequest.inputIds); ASSERT_EQ(status, absl::OkStatus()); } diff --git a/src/test/llm/llmtemplate_test.cpp b/src/test/llm/llmtemplate_test.cpp index e34b4b22a5..e51833cb3b 100644 --- a/src/test/llm/llmtemplate_test.cpp +++ b/src/test/llm/llmtemplate_test.cpp @@ -118,7 +118,7 @@ TEST_F(LLMChatTemplateTest, ChatTemplateEmptyBody) { LoadTemplateProcessor(); std::string finalPrompt = ""; std::string payloadBody = ""; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), false); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), false); std::string errorOutput = "Expecting value: line 1 column 1 (char 0)"; ASSERT_EQ(finalPrompt, errorOutput); } @@ -134,7 +134,7 @@ TEST_F(LLMChatTemplateTest, ChatTemplateEmptyMessage) { "messages": [] } )"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_TRUE(finalPrompt.empty()); } @@ -149,7 +149,7 @@ TEST_F(LLMChatTemplateTest, ChatTemplateMessageWithEmptyObject) { "messages": [{}] } )"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_EQ(finalPrompt, ""); } @@ -163,7 +163,7 @@ TEST_F(LLMChatTemplateTest, ChatTemplateDefault) { } )"; std::string expectedOutput = "User: How can I help you?"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_EQ(finalPrompt, expectedOutput); } @@ -177,7 +177,7 @@ TEST_F(LLMChatTemplateTest, ChatTemplateMultiMessage) { } )"; std::string expectedOutput = "User: How can I help you?User: 2How can I help you?"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_EQ(finalPrompt, expectedOutput); } @@ -193,7 +193,7 @@ TEST_F(LLMChatTemplateTest, ChatTemplateComplexMessage) { } )"; std::string expectedOutput = "User: hello"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_EQ(finalPrompt, expectedOutput); } @@ -211,7 +211,7 @@ TEST_F(LLMChatTemplateTest, ChatTemplateJinjaUppercase) { } )"; std::string expectedOutput = " Hi, HELLO "; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_EQ(finalPrompt, expectedOutput); } @@ -228,7 +228,7 @@ TEST_F(LLMChatTemplateTest, ChatTemplateJinjaException) { } )"; std::string errorOutput = "list object has no element 3"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), false); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), false); ASSERT_EQ(finalPrompt, errorOutput); } @@ -265,7 +265,7 @@ TEST_F(LLMChatTemplateTest, ChatTemplateComparePythonAndGenAiProcessors) { ] } )"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, pythonProcessorOutput), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, pythonProcessorOutput), true); ov::genai::Tokenizer tokenizer(directoryPath); ov::genai::ChatHistory chatHistory; chatHistory.push_back({{"role", "system"}, {"content", "You are a helpful assistant."}}); @@ -309,43 +309,43 @@ TEST_F(LLMChatTemplateTest, ChatTemplateKwargsPositive) { // Explicitly setting enable_thinking to true payloadBody = CreatePayloadBodyWithChatTemplateKwargs(R"({"enable_thinking": true})"); expectedOutput = "Thinking is on"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_EQ(finalPrompt, expectedOutput); // Explicitly setting enable_thinking to false payloadBody = CreatePayloadBodyWithChatTemplateKwargs(R"({"enable_thinking": false})"); expectedOutput = "Thinking is off"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_EQ(finalPrompt, expectedOutput); // Setting chat_template_kwargs to empty object payloadBody = CreatePayloadBodyWithChatTemplateKwargs(R"({})"); expectedOutput = "Thinking is off"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_EQ(finalPrompt, expectedOutput); // Explicitly setting chat_template_kwargs to null payloadBody = CreatePayloadBodyWithChatTemplateKwargs("null"); expectedOutput = "Thinking is off"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_EQ(finalPrompt, expectedOutput); // Setting chat_template_kwargs with multiple values including enable_thinking payloadBody = CreatePayloadBodyWithChatTemplateKwargs(R"({"enable_thinking": true, "another_param": "value"})"); expectedOutput = "Thinking is on"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_EQ(finalPrompt, expectedOutput); // Setting chat_template_kwargs with multiple values but without enable_thinking payloadBody = CreatePayloadBodyWithChatTemplateKwargs(R"({"another_param": "value", "yet_another_param": [1,2,3]})"); expectedOutput = "Thinking is off"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_EQ(finalPrompt, expectedOutput); // Default setting payloadBody = CreatePayloadBodyWithChatTemplateKwargs(""); expectedOutput = "Thinking is off"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); ASSERT_EQ(finalPrompt, expectedOutput); } @@ -358,13 +358,13 @@ TEST_F(LLMChatTemplateTest, ChatTemplateKwargsNegative) { std::string finalPrompt = ""; std::string payloadBody = CreatePayloadBodyWithChatTemplateKwargs(R"("string, not_an_object")"); std::string expectedOutput = "chat_template_kwargs must be an object"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), false); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), false); ASSERT_EQ(finalPrompt, expectedOutput); // chat_template_kwargs cannot contain keys that are natively provided to the template payloadBody = CreatePayloadBodyWithChatTemplateKwargs(R"({"messages": [{"role": "user", "content": "hello"}]})"); expectedOutput = "jinja2.environment.Template.render() got multiple values for keyword argument 'messages'"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), false); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), false); ASSERT_EQ(finalPrompt, expectedOutput); } @@ -399,7 +399,7 @@ TEST_F(LLMChatTemplateTest, ChatTemplateTojsonNoHtmlEscaping) { "tools": [{"type": "function", "function": {"name": "get_weather", "parameters": {"type": "object"}}}] } )"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); // Must contain literal tags, NOT <tools> EXPECT_THAT(finalPrompt, ::testing::HasSubstr("")); EXPECT_THAT(finalPrompt, ::testing::HasSubstr("")); @@ -435,7 +435,7 @@ TEST_F(LLMChatTemplateTest, ChatTemplateTojsonIndentWorks) { "tools": [{"type": "function", "function": {"name": "get_weather", "parameters": {"type": "object"}}}] } )"; - ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true); + ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, payloadBody, finalPrompt), true); // Must contain literal tags, NOT <tools> EXPECT_THAT(finalPrompt, ::testing::HasSubstr("")); EXPECT_THAT(finalPrompt, ::testing::HasSubstr(""));