From 922ed4e2217cf234d3439c2edd2617dca82a01ea Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Thu, 18 Jun 2026 15:59:51 +0200 Subject: [PATCH 1/9] Introduce canonical request --- src/llm/BUILD | 11 +++++ src/llm/apis/openai_api_handler.cpp | 52 ++++++++++++++++++- src/llm/apis/openai_api_handler.hpp | 11 ++++- src/llm/preprocessing/canonical_request.hpp | 55 +++++++++++++++++++++ src/test/http_openai_handler_test.cpp | 52 +++++++++++++++++++ 5 files changed, 177 insertions(+), 4 deletions(-) create mode 100644 src/llm/preprocessing/canonical_request.hpp diff --git a/src/llm/BUILD b/src/llm/BUILD index 0195973540..cb28f258b0 100644 --- a/src/llm/BUILD +++ b/src/llm/BUILD @@ -60,12 +60,23 @@ ovms_cc_library( "//src:image_conversion", "//src/filesystem:libovmsfilesystem", "@stb//:image", + ":canonical_request", ":openai_request", ":output_parsers", "//third_party:genai",], visibility = ["//visibility:public"], ) +ovms_cc_library( + name = "canonical_request", + hdrs = ["preprocessing/canonical_request.hpp"], + deps = [ + "//third_party:genai", + ":openai_request", + ], + visibility = ["//visibility:public"], +) + ovms_cc_library( name = "openai_completions_api_handler", hdrs = ["apis/openai_completions.hpp", "apis/openai_json_response.hpp"], diff --git a/src/llm/apis/openai_api_handler.cpp b/src/llm/apis/openai_api_handler.cpp index c52136d67c..74e7d7a886 100644 --- a/src/llm/apis/openai_api_handler.cpp +++ b/src/llm/apis/openai_api_handler.cpp @@ -439,7 +439,7 @@ absl::Status OpenAIApiHandler::parseTools() { return absl::OkStatus(); } -absl::StatusOr> OpenAIApiHandler::parseToolsToJsonContainer() { +absl::StatusOr> OpenAIApiHandler::parseToolsToJsonContainer() const { auto it = doc.FindMember("tools"); if (it == doc.MemberEnd() || it->value.IsNull()) { return std::nullopt; @@ -460,7 +460,7 @@ absl::StatusOr> OpenAIApiHandler::parseT } } -absl::StatusOr> OpenAIApiHandler::parseChatTemplateKwargsToJsonContainer() { +absl::StatusOr> OpenAIApiHandler::parseChatTemplateKwargsToJsonContainer() const { auto it = doc.FindMember("chat_template_kwargs"); if (it == doc.MemberEnd() || it->value.IsNull()) { return std::nullopt; @@ -492,15 +492,63 @@ const OpenAIRequest& OpenAIApiHandler::getRequest() const { return request; } +absl::StatusOr OpenAIApiHandler::buildCanonicalRequest(RendererType rendererType) const { + if (rendererType == RendererType::CPP_TOKENIZER) { + auto tools = parseToolsToJsonContainer(); + if (!tools.ok()) { + return tools.status(); + } + auto kwargs = parseChatTemplateKwargsToJsonContainer(); + if (!kwargs.ok()) { + return kwargs.status(); + } + CppPath cppPath{ + std::cref(request.chatHistory), + std::cref(request.imageHistory), + std::move(tools.value()), + std::move(kwargs.value()), + request.prompt, + true}; + return CanonicalRequest(std::move(cppPath)); + } + + PyPath pyPath{std::cref(request.processedJson)}; + return CanonicalRequest(std::move(pyPath)); +} + +absl::StatusOr OpenAIApiHandler::getCanonicalRequest(RendererType rendererType) const { + auto& cache = (rendererType == RendererType::CPP_TOKENIZER) ? cachedCppCanonicalRequest : cachedPyCanonicalRequest; + if (!cache.has_value()) { + auto canonical = buildCanonicalRequest(rendererType); + if (!canonical.ok()) { + return canonical.status(); + } + cache = std::move(canonical.value()); + } + return &(*cache); +} + const std::string& OpenAIApiHandler::getProcessedJson() const { + auto canonicalRequest = getCanonicalRequest(RendererType::PY_JINJA); + if (!canonicalRequest.ok()) { + return request.processedJson; + } return request.processedJson; } const ImageHistory& OpenAIApiHandler::getImageHistory() const { + auto canonicalRequest = getCanonicalRequest(RendererType::CPP_TOKENIZER); + if (!canonicalRequest.ok()) { + return request.imageHistory; + } return request.imageHistory; } ov::genai::ChatHistory& OpenAIApiHandler::getChatHistory() { + auto canonicalRequest = getCanonicalRequest(RendererType::CPP_TOKENIZER); + if (!canonicalRequest.ok()) { + return request.chatHistory; + } return request.chatHistory; } diff --git a/src/llm/apis/openai_api_handler.hpp b/src/llm/apis/openai_api_handler.hpp index 30d29c0d21..5d5036ed05 100644 --- a/src/llm/apis/openai_api_handler.hpp +++ b/src/llm/apis/openai_api_handler.hpp @@ -37,6 +37,7 @@ #pragma warning(pop) #include "../io_processing/output_parser.hpp" #include "openai_request.hpp" +#include "../preprocessing/canonical_request.hpp" // Forward declarations for types only used by reference in virtual method signatures namespace ov { @@ -119,6 +120,11 @@ class OpenAIApiHandler { // Shared VLM workaround: encode text to tokens using tokenizer, validates shape std::vector encodeTextToTokens(const std::string& text); + absl::StatusOr buildCanonicalRequest(RendererType rendererType) const; + + mutable std::optional cachedCppCanonicalRequest; + mutable std::optional cachedPyCanonicalRequest; + public: OpenAIApiHandler(Document& doc, Endpoint endpoint, std::chrono::time_point creationTime, ov::genai::Tokenizer tokenizer, const std::string& toolParserName = "", const std::string& reasoningParserName = "") : @@ -147,8 +153,8 @@ class OpenAIApiHandler { // Shared parsing (non-virtual) absl::Status parseTools(); - absl::StatusOr> parseToolsToJsonContainer(); - absl::StatusOr> parseChatTemplateKwargsToJsonContainer(); + absl::StatusOr> parseToolsToJsonContainer() const; + absl::StatusOr> parseChatTemplateKwargsToJsonContainer() const; const bool areToolsAvailable() const; // Accessors (non-virtual) @@ -159,6 +165,7 @@ class OpenAIApiHandler { const std::string& getProcessedJson() const; const ImageHistory& getImageHistory() const; ov::genai::ChatHistory& getChatHistory(); + absl::StatusOr getCanonicalRequest(RendererType rendererType) const; std::optional getMaxTokens() const; std::optional getResponseFormat() const; bool isStream() const; diff --git a/src/llm/preprocessing/canonical_request.hpp b/src/llm/preprocessing/canonical_request.hpp new file mode 100644 index 0000000000..6ad57939ef --- /dev/null +++ b/src/llm/preprocessing/canonical_request.hpp @@ -0,0 +1,55 @@ +//***************************************************************************** +// Copyright 2026 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace ovms { + +// Forward declarations +using ImageHistory = std::vector>; + +enum class RendererType { + CPP_TOKENIZER, + PY_JINJA, +}; + +// For C++ renderer path (tokenizer.apply_chat_template) +struct CppPath { + std::reference_wrapper chatHistory; + std::reference_wrapper imageHistory; + std::optional tools; + std::optional chatTemplateKwargs; + std::optional rawPrompt; + bool addGenerationPrompt = true; +}; + +// For Python Jinja renderer path +struct PyPath { + std::reference_wrapper processedJson; +}; + +// Single variant type: either C++ data or Python data, never both +using CanonicalRequest = std::variant; + +} // namespace ovms diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 4fd28e771d..2aad565910 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -766,6 +767,57 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTextInputCreatesUser } } +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestCppPathIsAvailable) { + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + + auto canonicalRequest = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + ASSERT_TRUE(canonicalRequest.ok()); + ASSERT_TRUE(std::holds_alternative(*canonicalRequest.value())); + + const auto& cppPath = std::get(*canonicalRequest.value()); + EXPECT_EQ(&cppPath.chatHistory.get(), &apiHandler->getChatHistory()); + EXPECT_EQ(&cppPath.imageHistory.get(), &apiHandler->getImageHistory()); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestPyPathIsAvailable) { + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + + auto canonicalRequest = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); + ASSERT_TRUE(canonicalRequest.ok()); + ASSERT_TRUE(std::holds_alternative(*canonicalRequest.value())); + + const auto& pyPath = std::get(*canonicalRequest.value()); + EXPECT_EQ(&pyPath.processedJson.get(), &apiHandler->getProcessedJson()); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, LegacyGettersRemainCompatibleWithCanonicalCache) { + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + + auto& chatHistory = apiHandler->getChatHistory(); + const auto& imageHistory = apiHandler->getImageHistory(); + const auto& processedJson = apiHandler->getProcessedJson(); + + auto cppCanonical = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + auto pyCanonical = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); + ASSERT_TRUE(cppCanonical.ok()); + ASSERT_TRUE(pyCanonical.ok()); + + ASSERT_TRUE(std::holds_alternative(*cppCanonical.value())); + ASSERT_TRUE(std::holds_alternative(*pyCanonical.value())); + + const auto& cppPath = std::get(*cppCanonical.value()); + const auto& pyPath = std::get(*pyCanonical.value()); + EXPECT_EQ(&cppPath.chatHistory.get(), &chatHistory); + EXPECT_EQ(&cppPath.imageHistory.get(), &imageHistory); + EXPECT_EQ(&pyPath.processedJson.get(), &processedJson); +} + TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquivalentMessages) { std::string json = createTextRequest("What is OpenVINO?"); auto apiHandler = parseCurrentRequest(json); From be6f7ee863e15b1fe8d587303567df650e83b046 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Thu, 18 Jun 2026 16:31:49 +0200 Subject: [PATCH 2/9] Initial --- src/llm/servable.cpp | 248 +++++++++++++++----------- src/test/http_openai_handler_test.cpp | 60 +++++++ 2 files changed, 205 insertions(+), 103 deletions(-) diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index b4d5ca8185..6f677c4ff3 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -41,6 +41,142 @@ namespace ovms { +namespace { + +#if (PYTHON_DISABLE != 0) +absl::Status applyTokenizerChatTemplate( + const std::shared_ptr& executionContext, + const std::shared_ptr& properties, + std::string& inputText) { + ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory(); + constexpr bool addGenerationPrompt = true; + auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer(); + if (!toolsStatus.ok()) { + return toolsStatus.status(); + } + const auto& tools = toolsStatus.value(); + auto chatTemplateKwargsStatus = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); + if (!chatTemplateKwargsStatus.ok()) { + return chatTemplateKwargsStatus.status(); + } + const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value(); + try { + inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); + } catch (const std::exception& e) { + SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); + return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); + } + return absl::OkStatus(); +} +#endif + +absl::Status detectImplicitReasoningStartIfNeeded( + const std::shared_ptr& executionContext, + const std::string& inputText) { + if (executionContext->apiHandler->getOutputParser() != nullptr) { + executionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(inputText); + } + return absl::OkStatus(); +} + +absl::Status buildChatCompletionsInputText( + const std::shared_ptr& executionContext, + const std::shared_ptr& properties, + std::string& inputText) { +#if (PYTHON_DISABLE == 0) + bool success; + if (executionContext->apiHandler->getProcessedJson().size() > 0) { + success = PyJinjaTemplateProcessor::applyChatTemplate(properties->templateProcessor, properties->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); + } else { + success = PyJinjaTemplateProcessor::applyChatTemplate(properties->templateProcessor, properties->modelsPath, executionContext->payload.body, inputText); + } + if (!success) { + return absl::Status(absl::StatusCode::kInvalidArgument, inputText); + } +#else + auto status = applyTokenizerChatTemplate(executionContext, properties, inputText); + if (!status.ok()) { + return status; + } +#endif + if (inputText.size() == 0) { + return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); + } + return detectImplicitReasoningStartIfNeeded(executionContext, inputText); +} + +absl::Status buildResponsesInputText( + const std::shared_ptr& executionContext, + const std::shared_ptr& properties, + std::string& inputText) { + if (executionContext->apiHandler->getChatHistory().size() > 0) { +#if (PYTHON_DISABLE == 0) + bool success = PyJinjaTemplateProcessor::applyChatTemplate(properties->templateProcessor, properties->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); + if (!success) { + return absl::Status(absl::StatusCode::kInvalidArgument, inputText); + } +#else + auto status = applyTokenizerChatTemplate(executionContext, properties, inputText); + if (!status.ok()) { + return status; + } +#endif + if (inputText.size() == 0) { + return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); + } + return detectImplicitReasoningStartIfNeeded(executionContext, inputText); + } + auto prompt = executionContext->apiHandler->getPrompt(); + if (!prompt.has_value()) { + return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing"); + } + inputText = prompt.value(); + return absl::OkStatus(); +} + +absl::Status buildInputTextForEndpoint( + const std::shared_ptr& executionContext, + const std::shared_ptr& properties, + std::string& inputText) { + switch (executionContext->endpoint) { + case Endpoint::CHAT_COMPLETIONS: + return buildChatCompletionsInputText(executionContext, properties, inputText); + case Endpoint::RESPONSES: + return buildResponsesInputText(executionContext, properties, inputText); + case Endpoint::COMPLETIONS: + inputText = executionContext->apiHandler->getPrompt().value(); + return absl::OkStatus(); + case Endpoint::TOKENIZE: + return absl::InternalError("Tokenize endpoint should not reach prepareInputs stage"); + } + return absl::InternalError("Unsupported endpoint"); +} + +absl::Status encodeAndValidateInputIds( + const std::shared_ptr& executionContext, + const std::shared_ptr& properties, + const std::string& inputText) { + bool encodeAddSpecialTokens = (executionContext->endpoint == Endpoint::COMPLETIONS); + executionContext->inputIds = properties->tokenizer.encode(inputText, ov::genai::add_special_tokens(encodeAddSpecialTokens)).input_ids; + if (properties->maxModelLength.has_value()) { + if (executionContext->inputIds.get_size() > properties->maxModelLength.value()) { + std::stringstream ss; + ss << "Number of prompt tokens: " << executionContext->inputIds.get_size() << " exceeds model max length: " << properties->maxModelLength.value(); + SPDLOG_LOGGER_ERROR(llm_calculator_logger, ss.str()); + return absl::Status(absl::StatusCode::kInvalidArgument, ss.str()); + } + if (executionContext->apiHandler->getMaxTokens().has_value() && executionContext->inputIds.get_size() + executionContext->apiHandler->getMaxTokens().value() > properties->maxModelLength.value()) { + std::stringstream ss; + ss << "Number of prompt tokens: " << executionContext->inputIds.get_size() << " + max tokens value: " << executionContext->apiHandler->getMaxTokens().value() << " exceeds model max length: " << properties->maxModelLength.value(); + SPDLOG_LOGGER_ERROR(llm_calculator_logger, ss.str()); + return absl::Status(absl::StatusCode::kInvalidArgument, ss.str()); + } + } + return absl::OkStatus(); +} + +} // namespace + void GenAiServable::determineDecodingMethod() { getProperties()->decodingMethod = DecodingMethod::STANDARD; auto& pluginConfig = getProperties()->pluginConfig; @@ -170,6 +306,7 @@ absl::Status GenAiServable::parseRequest(std::shared_ptr& executionContext) { + auto properties = getProperties(); if (executionContext->apiHandler == nullptr) { return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized"); } @@ -180,113 +317,18 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrendpoint) { - case Endpoint::CHAT_COMPLETIONS: { -#if (PYTHON_DISABLE == 0) - bool success; - if (executionContext->apiHandler->getProcessedJson().size() > 0) { - success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); - } else { - success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->payload.body, inputText); - } - if (!success) { - return absl::Status(absl::StatusCode::kInvalidArgument, inputText); - } -#else - ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory(); - constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded - auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolsStatus.ok()) { - return toolsStatus.status(); - } - const auto& tools = toolsStatus.value(); - auto chatTemplateKwargsStatus = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsStatus.ok()) { - return chatTemplateKwargsStatus.status(); - } - const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value(); - try { - inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); - } catch (const std::exception& e) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); - return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); - } -#endif - if (inputText.size() == 0) { - return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); - } - if (executionContext->apiHandler->getOutputParser() != nullptr) { - executionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(inputText); - } - break; - } - case Endpoint::RESPONSES: { - if (executionContext->apiHandler->getChatHistory().size() > 0) { -#if (PYTHON_DISABLE == 0) - bool success = PyJinjaTemplateProcessor::applyChatTemplate(getProperties()->templateProcessor, getProperties()->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); - if (!success) { - return absl::Status(absl::StatusCode::kInvalidArgument, inputText); - } -#else - ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory(); - constexpr bool addGenerationPrompt = true; - auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolsStatus.ok()) { - return toolsStatus.status(); - } - const auto& tools = toolsStatus.value(); - auto chatTemplateKwargsStatus = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsStatus.ok()) { - return chatTemplateKwargsStatus.status(); - } - const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value(); - try { - inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); - } catch (const std::exception& e) { - SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); - return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); - } -#endif - if (inputText.size() == 0) { - return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); - } - if (executionContext->apiHandler->getOutputParser() != nullptr) { - executionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(inputText); - } - } else { - auto prompt = executionContext->apiHandler->getPrompt(); - if (!prompt.has_value()) { - return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing"); - } - inputText = prompt.value(); - } - break; - } - case Endpoint::COMPLETIONS: { - inputText = executionContext->apiHandler->getPrompt().value(); - break; - } - case Endpoint::TOKENIZE: - return absl::InternalError("Tokenize endpoint should not reach prepareInputs stage"); + auto inputTextStatus = buildInputTextForEndpoint(executionContext, properties, inputText); + if (!inputTextStatus.ok()) { + return inputTextStatus; } + if (Config::instance().getServerSettings().verboseResponse) { executionContext->apiHandler->enableVerboseResponse(inputText); } - bool encodeAddSpecialTokens = (executionContext->endpoint == Endpoint::COMPLETIONS); - executionContext->inputIds = getProperties()->tokenizer.encode(inputText, ov::genai::add_special_tokens(encodeAddSpecialTokens)).input_ids; - if (getProperties()->maxModelLength.has_value()) { - if (executionContext->inputIds.get_size() > getProperties()->maxModelLength.value()) { - std::stringstream ss; - ss << "Number of prompt tokens: " << executionContext->inputIds.get_size() << " exceeds model max length: " << getProperties()->maxModelLength.value(); - SPDLOG_LOGGER_ERROR(llm_calculator_logger, ss.str()); - return absl::Status(absl::StatusCode::kInvalidArgument, ss.str()); - } - if (executionContext->apiHandler->getMaxTokens().has_value() && executionContext->inputIds.get_size() + executionContext->apiHandler->getMaxTokens().value() > getProperties()->maxModelLength.value()) { - std::stringstream ss; - ss << "Number of prompt tokens: " << executionContext->inputIds.get_size() << " + max tokens value: " << executionContext->apiHandler->getMaxTokens().value() << " exceeds model max length: " << getProperties()->maxModelLength.value(); - SPDLOG_LOGGER_ERROR(llm_calculator_logger, ss.str()); - return absl::Status(absl::StatusCode::kInvalidArgument, ss.str()); - } + + auto encodeStatus = encodeAndValidateInputIds(executionContext, properties, inputText); + if (!encodeStatus.ok()) { + return encodeStatus; } executionContext->apiHandler->setPromptTokensUsage(executionContext->inputIds.get_size()); diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 2aad565910..b2e846b3de 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -818,6 +818,66 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, LegacyGettersRemainCompatib EXPECT_EQ(&pyPath.processedJson.get(), &processedJson); } +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestCacheReturnsStableAddressPerRenderer) { + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + + auto cppCanonicalFirst = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + auto cppCanonicalSecond = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + ASSERT_TRUE(cppCanonicalFirst.ok()); + ASSERT_TRUE(cppCanonicalSecond.ok()); + EXPECT_EQ(cppCanonicalFirst.value(), cppCanonicalSecond.value()); + + auto pyCanonicalFirst = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); + auto pyCanonicalSecond = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); + ASSERT_TRUE(pyCanonicalFirst.ok()); + ASSERT_TRUE(pyCanonicalSecond.ok()); + EXPECT_EQ(pyCanonicalFirst.value(), pyCanonicalSecond.value()); +} + +TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestCppPathContainsTemplateInputs) { + if (endpoint() != ovms::Endpoint::CHAT_COMPLETIONS) { + GTEST_SKIP() << "Tools/chat_template_kwargs assertions apply to chat/completions flow"; + } + + std::string json = createTextRequest( + "What is OpenVINO?", + R"(, + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } + } + ], + "chat_template_kwargs": {"enable_thinking": true} + )"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + + auto cppCanonical = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + ASSERT_TRUE(cppCanonical.ok()); + ASSERT_TRUE(std::holds_alternative(*cppCanonical.value())); + + const auto& cppPath = std::get(*cppCanonical.value()); + EXPECT_TRUE(cppPath.addGenerationPrompt); + ASSERT_TRUE(cppPath.tools.has_value()); + ASSERT_TRUE(cppPath.chatTemplateKwargs.has_value()); + + const auto& kwargs = cppPath.chatTemplateKwargs.value(); + ASSERT_TRUE(kwargs["enable_thinking"].as_bool().has_value()); + EXPECT_TRUE(kwargs["enable_thinking"].as_bool().value()); +} + TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquivalentMessages) { std::string json = createTextRequest("What is OpenVINO?"); auto apiHandler = parseCurrentRequest(json); From 4afc919c208bae47142baad7e45d5b9827e5b2e9 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Thu, 18 Jun 2026 16:40:56 +0200 Subject: [PATCH 3/9] Adjust both handlers --- src/llm/BUILD | 2 + src/llm/apis/openai_api_handler.cpp | 22 +------ src/llm/apis/openai_api_handler.hpp | 2 + src/llm/apis/openai_completions.cpp | 33 ++++++++++ src/llm/apis/openai_completions.hpp | 1 + src/llm/apis/openai_responses.cpp | 33 ++++++++++ src/llm/apis/openai_responses.hpp | 1 + .../continuous_batching/servable.cpp | 24 ++----- .../visual_language_model/legacy/servable.cpp | 23 ++----- src/test/http_openai_handler_test.cpp | 65 ++++++++++++++++++- 10 files changed, 150 insertions(+), 56 deletions(-) diff --git a/src/llm/BUILD b/src/llm/BUILD index cb28f258b0..10d8a33c70 100644 --- a/src/llm/BUILD +++ b/src/llm/BUILD @@ -307,6 +307,7 @@ ovms_cc_library( "language_model/continuous_batching/llm_executor.hpp", "language_model/continuous_batching/servable_initializer.hpp", "visual_language_model/continuous_batching/servable.hpp", + "visual_language_model/image_prompt_utils.hpp", "language_model/legacy/servable.hpp", "language_model/legacy/servable_initializer.hpp", "language_model/legacy/legacy_executor.hpp", @@ -318,6 +319,7 @@ ovms_cc_library( "servable_initializer.cpp", "language_model/continuous_batching/servable.cpp", "language_model/continuous_batching/servable_initializer.cpp", + "visual_language_model/image_prompt_utils.cpp", "visual_language_model/continuous_batching/servable.cpp", "language_model/legacy/servable.cpp", "language_model/legacy/servable_initializer.cpp", diff --git a/src/llm/apis/openai_api_handler.cpp b/src/llm/apis/openai_api_handler.cpp index 74e7d7a886..7efaedd5aa 100644 --- a/src/llm/apis/openai_api_handler.cpp +++ b/src/llm/apis/openai_api_handler.cpp @@ -493,27 +493,7 @@ const OpenAIRequest& OpenAIApiHandler::getRequest() const { } absl::StatusOr OpenAIApiHandler::buildCanonicalRequest(RendererType rendererType) const { - if (rendererType == RendererType::CPP_TOKENIZER) { - auto tools = parseToolsToJsonContainer(); - if (!tools.ok()) { - return tools.status(); - } - auto kwargs = parseChatTemplateKwargsToJsonContainer(); - if (!kwargs.ok()) { - return kwargs.status(); - } - CppPath cppPath{ - std::cref(request.chatHistory), - std::cref(request.imageHistory), - std::move(tools.value()), - std::move(kwargs.value()), - request.prompt, - true}; - return CanonicalRequest(std::move(cppPath)); - } - - PyPath pyPath{std::cref(request.processedJson)}; - return CanonicalRequest(std::move(pyPath)); + return buildCanonicalRequestImpl(rendererType); } absl::StatusOr OpenAIApiHandler::getCanonicalRequest(RendererType rendererType) const { diff --git a/src/llm/apis/openai_api_handler.hpp b/src/llm/apis/openai_api_handler.hpp index 5d5036ed05..4391cad9fa 100644 --- a/src/llm/apis/openai_api_handler.hpp +++ b/src/llm/apis/openai_api_handler.hpp @@ -120,10 +120,12 @@ class OpenAIApiHandler { // Shared VLM workaround: encode text to tokens using tokenizer, validates shape std::vector encodeTextToTokens(const std::string& text); + virtual absl::StatusOr buildCanonicalRequestImpl(RendererType rendererType) const = 0; absl::StatusOr buildCanonicalRequest(RendererType rendererType) const; mutable std::optional cachedCppCanonicalRequest; mutable std::optional cachedPyCanonicalRequest; + mutable std::optional synthesizedProcessedJson; public: OpenAIApiHandler(Document& doc, Endpoint endpoint, std::chrono::time_point creationTime, diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 89009c0d74..ef6cd9ad32 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -275,6 +275,39 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional OpenAIChatCompletionsHandler::buildCanonicalRequestImpl(RendererType rendererType) const { + if (rendererType == RendererType::CPP_TOKENIZER) { + auto tools = parseToolsToJsonContainer(); + if (!tools.ok()) { + return tools.status(); + } + auto kwargs = parseChatTemplateKwargsToJsonContainer(); + if (!kwargs.ok()) { + return kwargs.status(); + } + CppPath cppPath{ + std::cref(request.chatHistory), + std::cref(request.imageHistory), + std::move(tools.value()), + std::move(kwargs.value()), + request.prompt, + true}; + return CanonicalRequest(std::move(cppPath)); + } + + if (request.processedJson.size() > 0) { + PyPath pyPath{std::cref(request.processedJson)}; + return CanonicalRequest(std::move(pyPath)); + } + + StringBuffer buffer; + Writer writer(buffer); + doc.Accept(writer); + synthesizedProcessedJson = buffer.GetString(); + PyPath pyPath{std::cref(synthesizedProcessedJson.value())}; + return CanonicalRequest(std::move(pyPath)); +} + // --- Unary response serialization --- std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const std::vector& generationOutputs) { diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp index cbb8f2645f..6f93be80c0 100644 --- a/src/llm/apis/openai_completions.hpp +++ b/src/llm/apis/openai_completions.hpp @@ -36,6 +36,7 @@ class OpenAIChatCompletionsHandler : public OpenAIApiHandler { absl::Status parseRequest(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength, std::optional allowedLocalMediaPath = std::nullopt, std::optional> allowedMediaDomains = std::nullopt) override; absl::Status parseMessages(std::optional allowedLocalMediaPath = std::nullopt, std::optional> allowedMediaDomains = std::nullopt); + absl::StatusOr buildCanonicalRequestImpl(RendererType rendererType) const override; std::string serializeUnaryResponse(const std::vector& generationOutputs) override; std::string serializeUnaryResponse(ov::genai::EncodedResults& results) override; diff --git a/src/llm/apis/openai_responses.cpp b/src/llm/apis/openai_responses.cpp index 1f984722dc..e18fcc882a 100644 --- a/src/llm/apis/openai_responses.cpp +++ b/src/llm/apis/openai_responses.cpp @@ -829,6 +829,39 @@ absl::Status OpenAIResponsesHandler::parseResponsesPart(std::optional return parseResponseFormat(); } +absl::StatusOr OpenAIResponsesHandler::buildCanonicalRequestImpl(RendererType rendererType) const { + if (rendererType == RendererType::CPP_TOKENIZER) { + auto tools = parseToolsToJsonContainer(); + if (!tools.ok()) { + return tools.status(); + } + auto kwargs = parseChatTemplateKwargsToJsonContainer(); + if (!kwargs.ok()) { + return kwargs.status(); + } + CppPath cppPath{ + std::cref(request.chatHistory), + std::cref(request.imageHistory), + std::move(tools.value()), + std::move(kwargs.value()), + request.prompt, + true}; + return CanonicalRequest(std::move(cppPath)); + } + + if (request.processedJson.size() > 0) { + PyPath pyPath{std::cref(request.processedJson)}; + return CanonicalRequest(std::move(pyPath)); + } + + StringBuffer buffer; + Writer writer(buffer); + doc.Accept(writer); + synthesizedProcessedJson = buffer.GetString(); + PyPath pyPath{std::cref(synthesizedProcessedJson.value())}; + return CanonicalRequest(std::move(pyPath)); +} + // --- Serialization helpers --- void OpenAIResponsesHandler::serializeToolChoice(Writer& writer) const { diff --git a/src/llm/apis/openai_responses.hpp b/src/llm/apis/openai_responses.hpp index 6a10400952..dc30bc6111 100644 --- a/src/llm/apis/openai_responses.hpp +++ b/src/llm/apis/openai_responses.hpp @@ -94,6 +94,7 @@ class OpenAIResponsesHandler : public OpenAIApiHandler { absl::Status parseRequest(std::optional maxTokensLimit, uint32_t bestOfLimit, std::optional maxModelLength, std::optional allowedLocalMediaPath = std::nullopt, std::optional> allowedMediaDomains = std::nullopt) override; + absl::StatusOr buildCanonicalRequestImpl(RendererType rendererType) const override; std::string serializeUnaryResponse(const std::vector& generationOutputs) override; std::string serializeUnaryResponse(ov::genai::EncodedResults& results) override; diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 7779d9c0be..b7ccc377ce 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -25,6 +25,7 @@ #include "../../../config.hpp" #include "../../../logging.hpp" #include "../../text_utils.hpp" +#include "../image_prompt_utils.hpp" #include "../../../tokenize/tokenize_parser.hpp" namespace ovms { @@ -73,26 +74,15 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrendpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) { ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory(); - for (size_t i = 0; i < chatHistory.size(); i++) { - const auto& message = chatHistory[i]; - if (message["content"].as_string().value_or("").find(" tag"); - } + auto restrictedTagStatus = vlm::rejectRestrictedImageTags(chatHistory); + if (!restrictedTagStatus.ok()) { + return restrictedTagStatus; } const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); - size_t imageIndex = 0; - std::unordered_map imageTags; - for (const auto& image : imageHistory) { - const auto& [chatTurnIndex, imageTensor] = image; - std::string imageTag = "\n"; - imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag; - vlmExecutionContext->inputImages.push_back(imageTensor); - } - - for (const auto& [chatTurnIndex, imageTagString] : imageTags) { - std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or(""); - chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent; + auto imagePlacementStatus = vlm::injectImageTagsAndCollectTensors(chatHistory, imageHistory, vlmExecutionContext->inputImages); + if (!imagePlacementStatus.ok()) { + return imagePlacementStatus; } constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 9c8e02c5df..0a8c6fc3e2 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -38,6 +38,7 @@ #include "../../../http_payload.hpp" #include "../../../mediapipe_internal/mediapipe_utils.hpp" #include "../../text_utils.hpp" +#include "../image_prompt_utils.hpp" #include "../../../tokenize/tokenize_parser.hpp" #if (PYTHON_DISABLE == 0) #include "../../py_jinja_template_processor.hpp" @@ -282,25 +283,15 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrendpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) { ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory(); - for (size_t i = 0; i < chatHistory.size(); i++) { - const auto& message = chatHistory[i]; - if (message["content"].as_string().value_or("").find(" tag"); - } + auto restrictedTagStatus = vlm::rejectRestrictedImageTags(chatHistory); + if (!restrictedTagStatus.ok()) { + return restrictedTagStatus; } const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); - size_t imageIndex = 0; - std::unordered_map imageTags; - for (const auto& image : imageHistory) { - const auto& [chatTurnIndex, imageTensor] = image; - std::string imageTag = "\n"; - imageTags[chatTurnIndex] = imageTags[chatTurnIndex] + imageTag; - vlmExecutionContext->inputImages.push_back(imageTensor); - } - for (const auto& [chatTurnIndex, imageTagString] : imageTags) { - std::string messageContent = chatHistory[chatTurnIndex]["content"].as_string().value_or(""); - chatHistory[chatTurnIndex]["content"] = imageTagString + messageContent; + auto imagePlacementStatus = vlm::injectImageTagsAndCollectTensors(chatHistory, imageHistory, vlmExecutionContext->inputImages); + if (!imagePlacementStatus.ok()) { + return imagePlacementStatus; } constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index b2e846b3de..1a66684dd5 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -29,6 +29,7 @@ #include "../filesystem/filesystem.hpp" #include "../llm/apis/openai_completions.hpp" #include "../llm/apis/openai_responses.hpp" +#include "../llm/visual_language_model/image_prompt_utils.hpp" #include #include "../module_names.hpp" #include "../servablemanagermodule.hpp" @@ -791,7 +792,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestPyPathIsAva ASSERT_TRUE(std::holds_alternative(*canonicalRequest.value())); const auto& pyPath = std::get(*canonicalRequest.value()); - EXPECT_EQ(&pyPath.processedJson.get(), &apiHandler->getProcessedJson()); + EXPECT_FALSE(pyPath.processedJson.get().empty()); } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, LegacyGettersRemainCompatibleWithCanonicalCache) { @@ -815,7 +816,11 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, LegacyGettersRemainCompatib const auto& pyPath = std::get(*pyCanonical.value()); EXPECT_EQ(&cppPath.chatHistory.get(), &chatHistory); EXPECT_EQ(&cppPath.imageHistory.get(), &imageHistory); - EXPECT_EQ(&pyPath.processedJson.get(), &processedJson); + if (!processedJson.empty()) { + EXPECT_EQ(&pyPath.processedJson.get(), &processedJson); + } else { + EXPECT_FALSE(pyPath.processedJson.get().empty()); + } } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestCacheReturnsStableAddressPerRenderer) { @@ -5558,6 +5563,62 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesImageHistoryIndexMatchesChatHistor EXPECT_LT(turnIndex, chatHistory.size()); } + TEST_F(HttpOpenAIHandlerParsingTest, VlmImagePromptUtilsRejectsRestrictedImageTag) { + std::string json = R"({ + "model": "llama", + "messages": [{"role": "user", "content": "prefix "}] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + auto apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + auto status = ovms::vlm::rejectRestrictedImageTags(apiHandler->getChatHistory()); + EXPECT_EQ(status, absl::InvalidArgumentError("Message contains restricted tag")); + } + + TEST_F(HttpOpenAIHandlerParsingTest, VlmImagePromptUtilsInjectsImageTagsAndCollectsTensors) { + const std::string base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="; + std::string json = R"({ + "model": "llama", + "messages": [ + {"role":"user","content":[ + {"type":"text","text":"what is in these images?"}, + {"type":"image_url","image_url":{"url":")" + + base64Image + R"("}}, + {"type":"image_url","image_url":{"url":")" + + base64Image + R"("}} + ]} + ] + })"; + doc.Parse(json.c_str()); + ASSERT_FALSE(doc.HasParseError()); + + std::optional maxTokensLimit; + uint32_t bestOfLimit = 0; + std::optional maxModelLength; + auto apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer); + ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus()); + + auto& chatHistory = apiHandler->getChatHistory(); + const auto& imageHistory = apiHandler->getImageHistory(); + ASSERT_EQ(chatHistory.size(), 1u); + ASSERT_EQ(imageHistory.size(), 2u); + + std::vector inputImages; + auto status = ovms::vlm::injectImageTagsAndCollectTensors(chatHistory, imageHistory, inputImages); + EXPECT_EQ(status, absl::OkStatus()); + + ASSERT_EQ(inputImages.size(), 2u); + std::string content = chatHistory[0]["content"].as_string().value_or(""); + EXPECT_THAT(content, ::testing::HasSubstr("\n\n")); + EXPECT_THAT(content, ::testing::HasSubstr("what is in these images?")); + } + // --- Tools normalisation edge cases --- TEST_F(HttpOpenAIHandlerParsingTest, ResponsesFlatToolWithoutParametersIsNormalised) { From 298d855569126e97aee868a8d38b4421da452dda Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Thu, 18 Jun 2026 16:43:55 +0200 Subject: [PATCH 4/9] Handlers adjustments --- src/llm/apis/openai_api_handler.cpp | 7 +++++-- src/test/http_openai_handler_test.cpp | 10 +++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/llm/apis/openai_api_handler.cpp b/src/llm/apis/openai_api_handler.cpp index 7efaedd5aa..42c0f8bc2b 100644 --- a/src/llm/apis/openai_api_handler.cpp +++ b/src/llm/apis/openai_api_handler.cpp @@ -510,8 +510,11 @@ absl::StatusOr OpenAIApiHandler::getCanonicalRequest(Re const std::string& OpenAIApiHandler::getProcessedJson() const { auto canonicalRequest = getCanonicalRequest(RendererType::PY_JINJA); - if (!canonicalRequest.ok()) { - return request.processedJson; + if (canonicalRequest.ok()) { + const auto* pyPath = std::get_if(*canonicalRequest.value()); + if (pyPath != nullptr) { + return pyPath->processedJson.get(); + } } return request.processedJson; } diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 1a66684dd5..582f12809a 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -763,8 +763,8 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTextInputCreatesUser EXPECT_EQ(chatHistory[0]["role"], "user"); EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); if (endpoint() == ovms::Endpoint::CHAT_COMPLETIONS) { - // Chat completions with simple text does not mutate the JSON, so processedJson is empty - EXPECT_TRUE(apiHandler->getProcessedJson().empty()); + // Canonical PyPath should always provide a JSON payload for template processing. + EXPECT_FALSE(apiHandler->getProcessedJson().empty()); } } @@ -913,7 +913,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquiva } #else if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_TRUE(apiHandler->getProcessedJson().empty()) << "processedJson should be empty when Python is disabled"; + EXPECT_FALSE(apiHandler->getProcessedJson().empty()) << "Canonical PyPath should provide JSON regardless of Python build mode"; } #endif } @@ -958,7 +958,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonEquivalentMult } #else if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_TRUE(apiHandler->getProcessedJson().empty()) << "processedJson should be empty when Python is disabled"; + EXPECT_FALSE(apiHandler->getProcessedJson().empty()) << "Canonical PyPath should provide JSON regardless of Python build mode"; } #endif } @@ -984,7 +984,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonIncludesToolsW } #else if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_TRUE(apiHandler->getProcessedJson().empty()) << "processedJson should be empty when Python is disabled"; + EXPECT_FALSE(apiHandler->getProcessedJson().empty()) << "Canonical PyPath should provide JSON regardless of Python build mode"; } #endif } From 6a301a08332498ead2aa97eafa3d4d8c6ea49a4b Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Thu, 18 Jun 2026 16:48:28 +0200 Subject: [PATCH 5/9] response handlers produced only one needed output --- src/llm/apis/openai_responses.cpp | 109 ++++++++++++-------------- src/test/http_openai_handler_test.cpp | 9 +++ 2 files changed, 57 insertions(+), 61 deletions(-) diff --git a/src/llm/apis/openai_responses.cpp b/src/llm/apis/openai_responses.cpp index e18fcc882a..6f68029007 100644 --- a/src/llm/apis/openai_responses.cpp +++ b/src/llm/apis/openai_responses.cpp @@ -749,64 +749,6 @@ absl::Status OpenAIResponsesHandler::parseResponsesPart(std::optional return toolsStatus; } -#if (PYTHON_DISABLE == 0) - // Build processedJson with a "messages" array in chat/completions format so that - // the Python Jinja template path can consume Responses API input without a separate code path. - // Handles reasoning, function_call (merged into assistant tool_calls), and - // function_call_output (converted to role:tool messages). - // - // Built after parseTools() so any tool filtering (e.g. tool_choice removing - // unselected tools) is reflected here, and so parseTools()'s own write to - // request.processedJson (Responses-shaped doc with "input") does not - // clobber the chat/completions-shaped JSON the Python Jinja path expects. - { - Document processedDoc; - processedDoc.SetObject(); - auto& alloc = processedDoc.GetAllocator(); - - Value messagesArray(kArrayType); - - auto inputArrIt = doc.FindMember("input"); - if (inputArrIt != doc.MemberEnd() && inputArrIt->value.IsArray()) { - ProcessedJsonSink sink(messagesArray, alloc); - ResponsesInputBuilder builder(sink); - auto processedStatus = builder.build(inputArrIt->value); - if (!processedStatus.ok()) { - return processedStatus; - } - } else if (inputArrIt != doc.MemberEnd() && inputArrIt->value.IsString()) { - // String input: emit a single user message so the Python Jinja path - // sees the same content the C++ chatHistory path does. - Value msgObj(kObjectType); - msgObj.AddMember("role", Value("user", alloc), alloc); - msgObj.AddMember("content", Value(inputArrIt->value.GetString(), alloc), alloc); - messagesArray.PushBack(msgObj, alloc); - } - - processedDoc.AddMember("messages", messagesArray, alloc); - - // Tools were already normalised to chat/completions nested format by - // convertResponsesToolsInPlace earlier in parseResponsesPart — just copy verbatim. - auto processedToolsIt = doc.FindMember("tools"); - if (processedToolsIt != doc.MemberEnd() && !processedToolsIt->value.IsNull()) { - Value toolsCopy(processedToolsIt->value, alloc); - processedDoc.AddMember("tools", toolsCopy, alloc); - } - - // Copy chat_template_kwargs from original doc if present - auto kwargsIt = doc.FindMember("chat_template_kwargs"); - if (kwargsIt != doc.MemberEnd() && !kwargsIt->value.IsNull()) { - Value kwargsCopy(kwargsIt->value, alloc); - processedDoc.AddMember("chat_template_kwargs", kwargsCopy, alloc); - } - - StringBuffer buffer; - Writer writer(buffer); - processedDoc.Accept(writer); - request.processedJson = buffer.GetString(); - } -#endif - // max_output_tokens: uint; optional // OpenAI Responses API uses this field for output token limit. it = doc.FindMember("max_output_tokens"); @@ -849,17 +791,62 @@ absl::StatusOr OpenAIResponsesHandler::buildCanonicalRequestIm return CanonicalRequest(std::move(cppPath)); } - if (request.processedJson.size() > 0) { - PyPath pyPath{std::cref(request.processedJson)}; - return CanonicalRequest(std::move(pyPath)); +#if (PYTHON_DISABLE == 0) + Document processedDoc; + processedDoc.SetObject(); + auto& alloc = processedDoc.GetAllocator(); + + Value messagesArray(kArrayType); + + auto inputArrIt = doc.FindMember("input"); + if (inputArrIt != doc.MemberEnd() && inputArrIt->value.IsArray()) { + ProcessedJsonSink sink(messagesArray, alloc); + ResponsesInputBuilder builder(sink); + auto processedStatus = builder.build(inputArrIt->value); + if (!processedStatus.ok()) { + return processedStatus; + } + } else if (inputArrIt != doc.MemberEnd() && inputArrIt->value.IsString()) { + // String input: emit a single user message so the Python Jinja path + // sees the same content the C++ chatHistory path does. + Value msgObj(kObjectType); + msgObj.AddMember("role", Value("user", alloc), alloc); + msgObj.AddMember("content", Value(inputArrIt->value.GetString(), alloc), alloc); + messagesArray.PushBack(msgObj, alloc); + } + + processedDoc.AddMember("messages", messagesArray, alloc); + + // Tools were already normalised to chat/completions nested format by + // convertResponsesToolsInPlace in parseResponsesPart — just copy verbatim. + auto processedToolsIt = doc.FindMember("tools"); + if (processedToolsIt != doc.MemberEnd() && !processedToolsIt->value.IsNull()) { + Value toolsCopy(processedToolsIt->value, alloc); + processedDoc.AddMember("tools", toolsCopy, alloc); + } + + // Copy chat_template_kwargs from original doc if present. + auto kwargsIt = doc.FindMember("chat_template_kwargs"); + if (kwargsIt != doc.MemberEnd() && !kwargsIt->value.IsNull()) { + Value kwargsCopy(kwargsIt->value, alloc); + processedDoc.AddMember("chat_template_kwargs", kwargsCopy, alloc); } + StringBuffer buffer; + Writer writer(buffer); + processedDoc.Accept(writer); + synthesizedProcessedJson = buffer.GetString(); + PyPath pyPath{std::cref(synthesizedProcessedJson.value())}; + return CanonicalRequest(std::move(pyPath)); +#else + // When Python support is disabled, keep a best-effort canonical payload. StringBuffer buffer; Writer writer(buffer); doc.Accept(writer); synthesizedProcessedJson = buffer.GetString(); PyPath pyPath{std::cref(synthesizedProcessedJson.value())}; return CanonicalRequest(std::move(pyPath)); +#endif } // --- Serialization helpers --- diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 582f12809a..78d291c104 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -793,6 +793,15 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestPyPathIsAva const auto& pyPath = std::get(*canonicalRequest.value()); EXPECT_FALSE(pyPath.processedJson.get().empty()); + + if (endpoint() == ovms::Endpoint::RESPONSES) { + rapidjson::Document processedDoc; + processedDoc.Parse(pyPath.processedJson.get().c_str()); + ASSERT_FALSE(processedDoc.HasParseError()); + ASSERT_TRUE(processedDoc.HasMember("messages")); + ASSERT_TRUE(processedDoc["messages"].IsArray()); + ASSERT_GE(processedDoc["messages"].Size(), 1u); + } } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, LegacyGettersRemainCompatibleWithCanonicalCache) { From 233466f83249b11d48f008d3863abd66dd86c4e5 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 19 Jun 2026 10:21:52 +0200 Subject: [PATCH 6/9] fix --- src/llm/apis/openai_api_handler.cpp | 11 +-- src/llm/apis/openai_completions.cpp | 6 -- src/llm/servable.cpp | 102 +++++++++++++++++--------- src/test/http_openai_handler_test.cpp | 7 +- 4 files changed, 70 insertions(+), 56 deletions(-) diff --git a/src/llm/apis/openai_api_handler.cpp b/src/llm/apis/openai_api_handler.cpp index 42c0f8bc2b..3ca8eb94be 100644 --- a/src/llm/apis/openai_api_handler.cpp +++ b/src/llm/apis/openai_api_handler.cpp @@ -348,11 +348,9 @@ absl::Status OpenAIApiHandler::parseTools() { return absl::InvalidArgumentError("tool_choice is not a valid JSON object or string"); } } - bool jsonChanged = false; if (toolChoice == "none") { // remove tools from the request doc.RemoveMember("tools"); - jsonChanged = true; } auto it = doc.FindMember("tools"); if (it != doc.MemberEnd() && !it->value.IsNull()) { @@ -405,7 +403,6 @@ absl::Status OpenAIApiHandler::parseTools() { // If toolChoice is set to a specific function name, we keep only that tool if (toolChoice != "auto" && toolChoice != "required" && toolChoice != functionName) { it->value.Erase(&obj); - jsonChanged = true; continue; } @@ -430,12 +427,6 @@ absl::Status OpenAIApiHandler::parseTools() { } request.toolChoice = toolChoice; - if (jsonChanged) { - StringBuffer buffer; - Writer writer(buffer); - doc.Accept(writer); - request.processedJson = buffer.GetString(); - } return absl::OkStatus(); } @@ -511,7 +502,7 @@ absl::StatusOr OpenAIApiHandler::getCanonicalRequest(Re const std::string& OpenAIApiHandler::getProcessedJson() const { auto canonicalRequest = getCanonicalRequest(RendererType::PY_JINJA); if (canonicalRequest.ok()) { - const auto* pyPath = std::get_if(*canonicalRequest.value()); + const auto* pyPath = std::get_if(canonicalRequest.value()); if (pyPath != nullptr) { return pyPath->processedJson.get(); } diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index ef6cd9ad32..dc19bd220f 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -265,12 +265,6 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages(std::optional writer(buffer); - doc.Accept(writer); - request.processedJson = buffer.GetString(); - } SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Parsed messages successfully"); return absl::OkStatus(); } diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 6f677c4ff3..5bef6cf2f4 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -37,6 +37,7 @@ #include "apis/openai_responses.hpp" #include "servable.hpp" #include "text_utils.hpp" +#include #include "../tokenize/tokenize_parser.hpp" namespace ovms { @@ -45,23 +46,11 @@ namespace { #if (PYTHON_DISABLE != 0) absl::Status applyTokenizerChatTemplate( - const std::shared_ptr& executionContext, + const CppPath& cppPath, const std::shared_ptr& properties, std::string& inputText) { - ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory(); - constexpr bool addGenerationPrompt = true; - auto toolsStatus = executionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolsStatus.ok()) { - return toolsStatus.status(); - } - const auto& tools = toolsStatus.value(); - auto chatTemplateKwargsStatus = executionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsStatus.ok()) { - return chatTemplateKwargsStatus.status(); - } - const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value(); try { - inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); + inputText = properties->tokenizer.apply_chat_template(cppPath.chatHistory.get(), cppPath.addGenerationPrompt, {}, cppPath.tools, cppPath.chatTemplateKwargs); } catch (const std::exception& e) { SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what()); return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one."); @@ -70,6 +59,19 @@ absl::Status applyTokenizerChatTemplate( } #endif + +#if (PYTHON_DISABLE == 0) +absl::Status applyPythonChatTemplate( + const PyPath& pyPath, + const std::shared_ptr& properties, + std::string& inputText) { + bool success = PyJinjaTemplateProcessor::applyChatTemplate(properties->templateProcessor, properties->modelsPath, pyPath.processedJson.get(), inputText); + if (!success) { + return absl::Status(absl::StatusCode::kInvalidArgument, inputText); + } + return absl::OkStatus(); +} +#endif absl::Status detectImplicitReasoningStartIfNeeded( const std::shared_ptr& executionContext, const std::string& inputText) { @@ -84,17 +86,28 @@ absl::Status buildChatCompletionsInputText( const std::shared_ptr& properties, std::string& inputText) { #if (PYTHON_DISABLE == 0) - bool success; - if (executionContext->apiHandler->getProcessedJson().size() > 0) { - success = PyJinjaTemplateProcessor::applyChatTemplate(properties->templateProcessor, properties->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); - } else { - success = PyJinjaTemplateProcessor::applyChatTemplate(properties->templateProcessor, properties->modelsPath, executionContext->payload.body, inputText); + auto canonicalRequest = executionContext->apiHandler->getCanonicalRequest(RendererType::PY_JINJA); + if (!canonicalRequest.ok()) { + return canonicalRequest.status(); } - if (!success) { - return absl::Status(absl::StatusCode::kInvalidArgument, inputText); + const auto* pyPath = std::get_if(canonicalRequest.value()); + if (pyPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for Python renderer"); + } + auto status = applyPythonChatTemplate(*pyPath, properties, inputText); + if (!status.ok()) { + return status; } #else - auto status = applyTokenizerChatTemplate(executionContext, properties, inputText); + auto canonicalRequest = executionContext->apiHandler->getCanonicalRequest(RendererType::CPP_TOKENIZER); + if (!canonicalRequest.ok()) { + return canonicalRequest.status(); + } + const auto* cppPath = std::get_if(canonicalRequest.value()); + if (cppPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for C++ renderer"); + } + auto status = applyTokenizerChatTemplate(*cppPath, properties, inputText); if (!status.ok()) { return status; } @@ -109,29 +122,46 @@ absl::Status buildResponsesInputText( const std::shared_ptr& executionContext, const std::shared_ptr& properties, std::string& inputText) { - if (executionContext->apiHandler->getChatHistory().size() > 0) { #if (PYTHON_DISABLE == 0) - bool success = PyJinjaTemplateProcessor::applyChatTemplate(properties->templateProcessor, properties->modelsPath, executionContext->apiHandler->getProcessedJson(), inputText); - if (!success) { - return absl::Status(absl::StatusCode::kInvalidArgument, inputText); + auto canonicalRequest = executionContext->apiHandler->getCanonicalRequest(RendererType::PY_JINJA); + if (!canonicalRequest.ok()) { + return canonicalRequest.status(); + } + const auto* pyPath = std::get_if(canonicalRequest.value()); + if (pyPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for Python renderer"); + } + auto status = applyPythonChatTemplate(*pyPath, properties, inputText); + if (!status.ok()) { + return status; } #else - auto status = applyTokenizerChatTemplate(executionContext, properties, inputText); + auto canonicalRequest = executionContext->apiHandler->getCanonicalRequest(RendererType::CPP_TOKENIZER); + if (!canonicalRequest.ok()) { + return canonicalRequest.status(); + } + const auto* cppPath = std::get_if(canonicalRequest.value()); + if (cppPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for C++ renderer"); + } + if (cppPath->chatHistory.get().size() > 0) { + auto status = applyTokenizerChatTemplate(*cppPath, properties, inputText); if (!status.ok()) { return status; } -#endif - if (inputText.size() == 0) { - return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); + } else { + auto prompt = cppPath->rawPrompt; + if (!prompt.has_value()) { + return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing"); } - return detectImplicitReasoningStartIfNeeded(executionContext, inputText); + inputText = prompt.value(); + return absl::OkStatus(); } - auto prompt = executionContext->apiHandler->getPrompt(); - if (!prompt.has_value()) { - return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing"); +#endif + if (inputText.size() == 0) { + return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty"); } - inputText = prompt.value(); - return absl::OkStatus(); + return detectImplicitReasoningStartIfNeeded(executionContext, inputText); } absl::Status buildInputTextForEndpoint( diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index 78d291c104..f065b44a58 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -897,9 +897,8 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquiva auto apiHandler = parseCurrentRequest(json); ASSERT_NE(apiHandler, nullptr); - // For Responses, processedJson is always built from chatHistory. - // For chat/completions with simple text, processedJson is empty (original body is used instead). - // In both cases, the chatHistory should be equivalent. + // Canonical PyPath provides processedJson lazily for both endpoints. + // In all cases, chatHistory should stay equivalent to parsed input. auto& chatHistory = apiHandler->getChatHistory(); ASSERT_EQ(chatHistory.size(), 1); EXPECT_EQ(chatHistory[0]["role"], "user"); @@ -907,7 +906,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquiva #if (PYTHON_DISABLE == 0) if (endpoint() == ovms::Endpoint::RESPONSES) { - // Responses path builds processedJson with messages array + // Responses canonical path builds processedJson with messages array. const std::string& processedJson = apiHandler->getProcessedJson(); ASSERT_FALSE(processedJson.empty()) << "Responses should build processedJson"; // Verify it contains a messages array with the correct content From 008bd83c5cd3f31584e84c1338cd32a71bf75be6 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 19 Jun 2026 10:33:31 +0200 Subject: [PATCH 7/9] next step --- src/llm/servable.cpp | 72 ++++++++++++------- .../continuous_batching/servable.cpp | 32 ++++----- .../visual_language_model/legacy/servable.cpp | 26 ++++--- 3 files changed, 73 insertions(+), 57 deletions(-) diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index 5bef6cf2f4..ce571a0086 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -82,15 +82,12 @@ absl::Status detectImplicitReasoningStartIfNeeded( } absl::Status buildChatCompletionsInputText( + const CanonicalRequest& canonicalRequest, const std::shared_ptr& executionContext, const std::shared_ptr& properties, std::string& inputText) { #if (PYTHON_DISABLE == 0) - auto canonicalRequest = executionContext->apiHandler->getCanonicalRequest(RendererType::PY_JINJA); - if (!canonicalRequest.ok()) { - return canonicalRequest.status(); - } - const auto* pyPath = std::get_if(canonicalRequest.value()); + const auto* pyPath = std::get_if(&canonicalRequest); if (pyPath == nullptr) { return absl::InternalError("Canonical request path mismatch for Python renderer"); } @@ -99,11 +96,7 @@ absl::Status buildChatCompletionsInputText( return status; } #else - auto canonicalRequest = executionContext->apiHandler->getCanonicalRequest(RendererType::CPP_TOKENIZER); - if (!canonicalRequest.ok()) { - return canonicalRequest.status(); - } - const auto* cppPath = std::get_if(canonicalRequest.value()); + const auto* cppPath = std::get_if(&canonicalRequest); if (cppPath == nullptr) { return absl::InternalError("Canonical request path mismatch for C++ renderer"); } @@ -119,15 +112,12 @@ absl::Status buildChatCompletionsInputText( } absl::Status buildResponsesInputText( + const CanonicalRequest& canonicalRequest, const std::shared_ptr& executionContext, const std::shared_ptr& properties, std::string& inputText) { #if (PYTHON_DISABLE == 0) - auto canonicalRequest = executionContext->apiHandler->getCanonicalRequest(RendererType::PY_JINJA); - if (!canonicalRequest.ok()) { - return canonicalRequest.status(); - } - const auto* pyPath = std::get_if(canonicalRequest.value()); + const auto* pyPath = std::get_if(&canonicalRequest); if (pyPath == nullptr) { return absl::InternalError("Canonical request path mismatch for Python renderer"); } @@ -136,11 +126,7 @@ absl::Status buildResponsesInputText( return status; } #else - auto canonicalRequest = executionContext->apiHandler->getCanonicalRequest(RendererType::CPP_TOKENIZER); - if (!canonicalRequest.ok()) { - return canonicalRequest.status(); - } - const auto* cppPath = std::get_if(canonicalRequest.value()); + const auto* cppPath = std::get_if(&canonicalRequest); if (cppPath == nullptr) { return absl::InternalError("Canonical request path mismatch for C++ renderer"); } @@ -165,17 +151,24 @@ absl::Status buildResponsesInputText( } absl::Status buildInputTextForEndpoint( + const CanonicalRequest& canonicalRequest, const std::shared_ptr& executionContext, const std::shared_ptr& properties, std::string& inputText) { switch (executionContext->endpoint) { case Endpoint::CHAT_COMPLETIONS: - return buildChatCompletionsInputText(executionContext, properties, inputText); + return buildChatCompletionsInputText(canonicalRequest, executionContext, properties, inputText); case Endpoint::RESPONSES: - return buildResponsesInputText(executionContext, properties, inputText); + return buildResponsesInputText(canonicalRequest, executionContext, properties, inputText); case Endpoint::COMPLETIONS: - inputText = executionContext->apiHandler->getPrompt().value(); - return absl::OkStatus(); + if (const auto* cppPath = std::get_if(&canonicalRequest)) { + if (!cppPath->rawPrompt.has_value()) { + return absl::Status(absl::StatusCode::kInvalidArgument, "input is missing"); + } + inputText = cppPath->rawPrompt.value(); + return absl::OkStatus(); + } + return absl::InternalError("Canonical request path mismatch for completions endpoint"); case Endpoint::TOKENIZE: return absl::InternalError("Tokenize endpoint should not reach prepareInputs stage"); } @@ -341,13 +334,40 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrapiHandler->getCanonicalRequest(RendererType::CPP_TOKENIZER); + if (!cppCanonicalRequest.ok()) { + return cppCanonicalRequest.status(); + } + const auto* cppPath = std::get_if(cppCanonicalRequest.value()); + if (cppPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for C++ renderer"); + } + // Base servable cannot process images - if (executionContext->apiHandler->getImageHistory().size() > 0) { + if (cppPath->imageHistory.get().size() > 0) { return absl::InternalError("This servable supports only text input, but image_url has been provided"); } +#if (PYTHON_DISABLE == 0) + RendererType rendererType = (executionContext->endpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) + ? RendererType::PY_JINJA + : RendererType::CPP_TOKENIZER; +#else + RendererType rendererType = RendererType::CPP_TOKENIZER; +#endif + const CanonicalRequest* canonicalRequest = cppCanonicalRequest.value(); +#if (PYTHON_DISABLE == 0) + if (rendererType == RendererType::PY_JINJA) { + auto pyCanonicalRequest = executionContext->apiHandler->getCanonicalRequest(RendererType::PY_JINJA); + if (!pyCanonicalRequest.ok()) { + return pyCanonicalRequest.status(); + } + canonicalRequest = pyCanonicalRequest.value(); + } +#endif + std::string inputText; - auto inputTextStatus = buildInputTextForEndpoint(executionContext, properties, inputText); + auto inputTextStatus = buildInputTextForEndpoint(*canonicalRequest, executionContext, properties, inputText); if (!inputTextStatus.ok()) { return inputTextStatus; } diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index b7ccc377ce..1be2afae64 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "../../../config.hpp" @@ -72,39 +73,36 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptrendpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) { - ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory(); + auto canonicalRequest = vlmExecutionContext->apiHandler->getCanonicalRequest(RendererType::CPP_TOKENIZER); + if (!canonicalRequest.ok()) { + return canonicalRequest.status(); + } + const auto* cppPath = std::get_if(canonicalRequest.value()); + if (cppPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for C++ renderer"); + } + ov::genai::ChatHistory chatHistory = cppPath->chatHistory.get(); auto restrictedTagStatus = vlm::rejectRestrictedImageTags(chatHistory); if (!restrictedTagStatus.ok()) { return restrictedTagStatus; } - const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); + const ImageHistory& imageHistory = cppPath->imageHistory.get(); auto imagePlacementStatus = vlm::injectImageTagsAndCollectTensors(chatHistory, imageHistory, vlmExecutionContext->inputImages); if (!imagePlacementStatus.ok()) { return imagePlacementStatus; } - constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded - auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolsStatus.ok()) { - return toolsStatus.status(); - } - const auto& tools = toolsStatus.value(); - auto chatTemplateKwargsStatus = vlmExecutionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsStatus.ok()) { - return chatTemplateKwargsStatus.status(); - } - const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value(); if (llm_calculator_logger->should_log(spdlog::level::trace)) { SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatHistory messages: {}", chatHistory.get_messages().to_json_string()); SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatHistory.get_tools(): {}", chatHistory.get_tools().to_json_string()); SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatHistory.get_extra_context(): {}", chatHistory.get_extra_context().to_json_string()); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM tools: {}", tools.has_value() ? tools->to_json_string() : std::string("")); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatTemplateKwargs: {}", chatTemplateKwargs.has_value() ? chatTemplateKwargs->to_json_string() : std::string("")); - SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM addGenerationPrompt: {}", addGenerationPrompt); + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM tools: {}", cppPath->tools.has_value() ? cppPath->tools->to_json_string() : std::string("")); + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM chatTemplateKwargs: {}", cppPath->chatTemplateKwargs.has_value() ? cppPath->chatTemplateKwargs->to_json_string() : std::string("")); + SPDLOG_LOGGER_TRACE(llm_calculator_logger, "VLM addGenerationPrompt: {}", cppPath->addGenerationPrompt); } - vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, cppPath->addGenerationPrompt, {}, cppPath->tools, cppPath->chatTemplateKwargs); if (vlmExecutionContext->apiHandler->getOutputParser() != nullptr) { vlmExecutionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(vlmExecutionContext->inputText); } diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 0a8c6fc3e2..8a97494eef 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "../../../logging.hpp" @@ -281,31 +282,28 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptrendpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) { - ov::genai::ChatHistory& chatHistory = vlmExecutionContext->apiHandler->getChatHistory(); + auto canonicalRequest = vlmExecutionContext->apiHandler->getCanonicalRequest(RendererType::CPP_TOKENIZER); + if (!canonicalRequest.ok()) { + return canonicalRequest.status(); + } + const auto* cppPath = std::get_if(canonicalRequest.value()); + if (cppPath == nullptr) { + return absl::InternalError("Canonical request path mismatch for C++ renderer"); + } + ov::genai::ChatHistory chatHistory = cppPath->chatHistory.get(); auto restrictedTagStatus = vlm::rejectRestrictedImageTags(chatHistory); if (!restrictedTagStatus.ok()) { return restrictedTagStatus; } - const ImageHistory& imageHistory = vlmExecutionContext->apiHandler->getImageHistory(); + const ImageHistory& imageHistory = cppPath->imageHistory.get(); auto imagePlacementStatus = vlm::injectImageTagsAndCollectTensors(chatHistory, imageHistory, vlmExecutionContext->inputImages); if (!imagePlacementStatus.ok()) { return imagePlacementStatus; } - constexpr bool addGenerationPrompt = true; // confirm it should be hardcoded - auto toolsStatus = vlmExecutionContext->apiHandler->parseToolsToJsonContainer(); - if (!toolsStatus.ok()) { - return toolsStatus.status(); - } - const auto& tools = toolsStatus.value(); - auto chatTemplateKwargsStatus = vlmExecutionContext->apiHandler->parseChatTemplateKwargsToJsonContainer(); - if (!chatTemplateKwargsStatus.ok()) { - return chatTemplateKwargsStatus.status(); - } - const auto& chatTemplateKwargs = chatTemplateKwargsStatus.value(); - vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, addGenerationPrompt, {}, tools, chatTemplateKwargs); + vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, cppPath->addGenerationPrompt, {}, cppPath->tools, cppPath->chatTemplateKwargs); if (vlmExecutionContext->apiHandler->getOutputParser() != nullptr) { vlmExecutionContext->apiHandler->getOutputParser()->detectAndSetImplicitReasoningStart(vlmExecutionContext->inputText); } From 8e393edc3b1471c1eb16634d39a320c20b7082b2 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 19 Jun 2026 10:45:28 +0200 Subject: [PATCH 8/9] remove request.processedJson --- src/llm/apis/openai_api_handler.cpp | 3 ++- src/llm/apis/openai_completions.cpp | 5 ----- src/llm/apis/openai_request.hpp | 1 - 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/llm/apis/openai_api_handler.cpp b/src/llm/apis/openai_api_handler.cpp index 3ca8eb94be..67462dc557 100644 --- a/src/llm/apis/openai_api_handler.cpp +++ b/src/llm/apis/openai_api_handler.cpp @@ -507,7 +507,8 @@ const std::string& OpenAIApiHandler::getProcessedJson() const { return pyPath->processedJson.get(); } } - return request.processedJson; + static const std::string EMPTY_JSON{}; + return EMPTY_JSON; } const ImageHistory& OpenAIApiHandler::getImageHistory() const { diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index dc19bd220f..209de33555 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -289,11 +289,6 @@ absl::StatusOr OpenAIChatCompletionsHandler::buildCanonicalReq return CanonicalRequest(std::move(cppPath)); } - if (request.processedJson.size() > 0) { - PyPath pyPath{std::cref(request.processedJson)}; - return CanonicalRequest(std::move(pyPath)); - } - StringBuffer buffer; Writer writer(buffer); doc.Accept(writer); diff --git a/src/llm/apis/openai_request.hpp b/src/llm/apis/openai_request.hpp index 24327be44f..02c31a509c 100644 --- a/src/llm/apis/openai_request.hpp +++ b/src/llm/apis/openai_request.hpp @@ -41,7 +41,6 @@ struct StreamOptions { // Class that maps OpenAI request content. struct OpenAIRequest { ov::genai::ChatHistory chatHistory; - std::string processedJson; ImageHistory imageHistory; std::optional prompt{std::nullopt}; bool stream{false}; From 614d52423e17eccfb740140dbe7f15bc3197fb73 Mon Sep 17 00:00:00 2001 From: Michal Kulakowski Date: Fri, 19 Jun 2026 13:51:34 +0200 Subject: [PATCH 9/9] style --- src/llm/apis/openai_api_handler.cpp | 1 - src/llm/apis/openai_api_handler.hpp | 1 - src/llm/apis/openai_completions.cpp | 13 +- src/llm/apis/openai_request.hpp | 1 - src/llm/apis/openai_responses.cpp | 8 +- src/llm/servable.cpp | 7 +- src/test/http_openai_handler_test.cpp | 192 +++++++++++++------------- 7 files changed, 112 insertions(+), 111 deletions(-) diff --git a/src/llm/apis/openai_api_handler.cpp b/src/llm/apis/openai_api_handler.cpp index 67462dc557..e08ebdb32c 100644 --- a/src/llm/apis/openai_api_handler.cpp +++ b/src/llm/apis/openai_api_handler.cpp @@ -535,7 +535,6 @@ std::optional OpenAIApiHandler::getResponseFormat() const { return request.responseFormat; } -std::optional OpenAIApiHandler::getPrompt() const { return request.prompt; } std::optional OpenAIApiHandler::getNumReturnSequences() const { return request.numReturnSequences; } StreamOptions OpenAIApiHandler::getStreamOptions() const { return request.streamOptions; } diff --git a/src/llm/apis/openai_api_handler.hpp b/src/llm/apis/openai_api_handler.hpp index 4391cad9fa..77cdc81042 100644 --- a/src/llm/apis/openai_api_handler.hpp +++ b/src/llm/apis/openai_api_handler.hpp @@ -161,7 +161,6 @@ class OpenAIApiHandler { // Accessors (non-virtual) const OpenAIRequest& getRequest() const; - std::optional getPrompt() const; std::optional getNumReturnSequences() const; StreamOptions getStreamOptions() const; const std::string& getProcessedJson() const; diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp index 209de33555..e65ffcad9e 100644 --- a/src/llm/apis/openai_completions.cpp +++ b/src/llm/apis/openai_completions.cpp @@ -75,11 +75,9 @@ absl::Status OpenAIChatCompletionsHandler::parseCompletionsPart() { if (it != doc.MemberEnd()) { if (!it->value.IsString()) { return absl::InvalidArgumentError("prompt is not a string"); - } else { - request.prompt = it->value.GetString(); } } - if (!request.prompt.has_value() || !request.prompt.value().size()) { + if (it == doc.MemberEnd() || it->value.GetStringLength() == 0) { return absl::Status(absl::StatusCode::kInvalidArgument, "prompt is missing"); } // logprobs: int; 1 value allowed @@ -279,12 +277,19 @@ absl::StatusOr OpenAIChatCompletionsHandler::buildCanonicalReq if (!kwargs.ok()) { return kwargs.status(); } + std::optional rawPrompt; + if (endpoint == Endpoint::COMPLETIONS) { + auto promptIt = doc.FindMember("prompt"); + if (promptIt != doc.MemberEnd() && promptIt->value.IsString()) { + rawPrompt = std::string(promptIt->value.GetString(), promptIt->value.GetStringLength()); + } + } CppPath cppPath{ std::cref(request.chatHistory), std::cref(request.imageHistory), std::move(tools.value()), std::move(kwargs.value()), - request.prompt, + std::move(rawPrompt), true}; return CanonicalRequest(std::move(cppPath)); } diff --git a/src/llm/apis/openai_request.hpp b/src/llm/apis/openai_request.hpp index 02c31a509c..47ba0370d1 100644 --- a/src/llm/apis/openai_request.hpp +++ b/src/llm/apis/openai_request.hpp @@ -42,7 +42,6 @@ struct StreamOptions { struct OpenAIRequest { ov::genai::ChatHistory chatHistory; ImageHistory imageHistory; - std::optional prompt{std::nullopt}; bool stream{false}; StreamOptions streamOptions; std::string model; diff --git a/src/llm/apis/openai_responses.cpp b/src/llm/apis/openai_responses.cpp index 6f68029007..4c90835e24 100644 --- a/src/llm/apis/openai_responses.cpp +++ b/src/llm/apis/openai_responses.cpp @@ -648,14 +648,14 @@ absl::Status OpenAIResponsesHandler::parseInput(std::optional allow } if (inputIt->value.IsString()) { - request.prompt = inputIt->value.GetString(); - if (request.prompt.value().empty()) { + const std::string inputText(inputIt->value.GetString(), inputIt->value.GetStringLength()); + if (inputText.empty()) { return absl::InvalidArgumentError("input cannot be empty"); } request.chatHistory.push_back({}); request.chatHistory.last()["role"] = "user"; - request.chatHistory.last()["content"] = request.prompt.value(); + request.chatHistory.last()["content"] = inputText; } else if (inputIt->value.IsArray()) { if (inputIt->value.GetArray().Size() == 0) { return absl::InvalidArgumentError("input array must not be empty"); @@ -786,7 +786,7 @@ absl::StatusOr OpenAIResponsesHandler::buildCanonicalRequestIm std::cref(request.imageHistory), std::move(tools.value()), std::move(kwargs.value()), - request.prompt, + std::nullopt, true}; return CanonicalRequest(std::move(cppPath)); } diff --git a/src/llm/servable.cpp b/src/llm/servable.cpp index ce571a0086..f230b3990c 100644 --- a/src/llm/servable.cpp +++ b/src/llm/servable.cpp @@ -59,7 +59,6 @@ absl::Status applyTokenizerChatTemplate( } #endif - #if (PYTHON_DISABLE == 0) absl::Status applyPythonChatTemplate( const PyPath& pyPath, @@ -124,7 +123,7 @@ absl::Status buildResponsesInputText( auto status = applyPythonChatTemplate(*pyPath, properties, inputText); if (!status.ok()) { return status; - } + } #else const auto* cppPath = std::get_if(&canonicalRequest); if (cppPath == nullptr) { @@ -350,8 +349,8 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptrendpoint == Endpoint::CHAT_COMPLETIONS || executionContext->endpoint == Endpoint::RESPONSES) - ? RendererType::PY_JINJA - : RendererType::CPP_TOKENIZER; + ? RendererType::PY_JINJA + : RendererType::CPP_TOKENIZER; #else RendererType rendererType = RendererType::CPP_TOKENIZER; #endif diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp index f065b44a58..f42ab1377a 100644 --- a/src/test/http_openai_handler_test.cpp +++ b/src/test/http_openai_handler_test.cpp @@ -763,101 +763,101 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ParsingTextInputCreatesUser EXPECT_EQ(chatHistory[0]["role"], "user"); EXPECT_EQ(chatHistory[0]["content"], "What is OpenVINO?"); if (endpoint() == ovms::Endpoint::CHAT_COMPLETIONS) { - // Canonical PyPath should always provide a JSON payload for template processing. - EXPECT_FALSE(apiHandler->getProcessedJson().empty()); + // Canonical PyPath should always provide a JSON payload for template processing. + EXPECT_FALSE(apiHandler->getProcessedJson().empty()); } } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestCppPathIsAvailable) { - std::string json = createTextRequest("What is OpenVINO?"); - auto apiHandler = parseCurrentRequest(json); - ASSERT_NE(apiHandler, nullptr); + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); - auto canonicalRequest = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); - ASSERT_TRUE(canonicalRequest.ok()); - ASSERT_TRUE(std::holds_alternative(*canonicalRequest.value())); + auto canonicalRequest = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + ASSERT_TRUE(canonicalRequest.ok()); + ASSERT_TRUE(std::holds_alternative(*canonicalRequest.value())); - const auto& cppPath = std::get(*canonicalRequest.value()); - EXPECT_EQ(&cppPath.chatHistory.get(), &apiHandler->getChatHistory()); - EXPECT_EQ(&cppPath.imageHistory.get(), &apiHandler->getImageHistory()); + const auto& cppPath = std::get(*canonicalRequest.value()); + EXPECT_EQ(&cppPath.chatHistory.get(), &apiHandler->getChatHistory()); + EXPECT_EQ(&cppPath.imageHistory.get(), &apiHandler->getImageHistory()); } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestPyPathIsAvailable) { - std::string json = createTextRequest("What is OpenVINO?"); - auto apiHandler = parseCurrentRequest(json); - ASSERT_NE(apiHandler, nullptr); - - auto canonicalRequest = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); - ASSERT_TRUE(canonicalRequest.ok()); - ASSERT_TRUE(std::holds_alternative(*canonicalRequest.value())); - - const auto& pyPath = std::get(*canonicalRequest.value()); - EXPECT_FALSE(pyPath.processedJson.get().empty()); - - if (endpoint() == ovms::Endpoint::RESPONSES) { - rapidjson::Document processedDoc; - processedDoc.Parse(pyPath.processedJson.get().c_str()); - ASSERT_FALSE(processedDoc.HasParseError()); - ASSERT_TRUE(processedDoc.HasMember("messages")); - ASSERT_TRUE(processedDoc["messages"].IsArray()); - ASSERT_GE(processedDoc["messages"].Size(), 1u); - } + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + + auto canonicalRequest = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); + ASSERT_TRUE(canonicalRequest.ok()); + ASSERT_TRUE(std::holds_alternative(*canonicalRequest.value())); + + const auto& pyPath = std::get(*canonicalRequest.value()); + EXPECT_FALSE(pyPath.processedJson.get().empty()); + + if (endpoint() == ovms::Endpoint::RESPONSES) { + rapidjson::Document processedDoc; + processedDoc.Parse(pyPath.processedJson.get().c_str()); + ASSERT_FALSE(processedDoc.HasParseError()); + ASSERT_TRUE(processedDoc.HasMember("messages")); + ASSERT_TRUE(processedDoc["messages"].IsArray()); + ASSERT_GE(processedDoc["messages"].Size(), 1u); + } } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, LegacyGettersRemainCompatibleWithCanonicalCache) { - std::string json = createTextRequest("What is OpenVINO?"); - auto apiHandler = parseCurrentRequest(json); - ASSERT_NE(apiHandler, nullptr); - - auto& chatHistory = apiHandler->getChatHistory(); - const auto& imageHistory = apiHandler->getImageHistory(); - const auto& processedJson = apiHandler->getProcessedJson(); - - auto cppCanonical = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); - auto pyCanonical = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); - ASSERT_TRUE(cppCanonical.ok()); - ASSERT_TRUE(pyCanonical.ok()); - - ASSERT_TRUE(std::holds_alternative(*cppCanonical.value())); - ASSERT_TRUE(std::holds_alternative(*pyCanonical.value())); - - const auto& cppPath = std::get(*cppCanonical.value()); - const auto& pyPath = std::get(*pyCanonical.value()); - EXPECT_EQ(&cppPath.chatHistory.get(), &chatHistory); - EXPECT_EQ(&cppPath.imageHistory.get(), &imageHistory); - if (!processedJson.empty()) { - EXPECT_EQ(&pyPath.processedJson.get(), &processedJson); - } else { - EXPECT_FALSE(pyPath.processedJson.get().empty()); - } + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); + + auto& chatHistory = apiHandler->getChatHistory(); + const auto& imageHistory = apiHandler->getImageHistory(); + const auto& processedJson = apiHandler->getProcessedJson(); + + auto cppCanonical = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + auto pyCanonical = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); + ASSERT_TRUE(cppCanonical.ok()); + ASSERT_TRUE(pyCanonical.ok()); + + ASSERT_TRUE(std::holds_alternative(*cppCanonical.value())); + ASSERT_TRUE(std::holds_alternative(*pyCanonical.value())); + + const auto& cppPath = std::get(*cppCanonical.value()); + const auto& pyPath = std::get(*pyCanonical.value()); + EXPECT_EQ(&cppPath.chatHistory.get(), &chatHistory); + EXPECT_EQ(&cppPath.imageHistory.get(), &imageHistory); + if (!processedJson.empty()) { + EXPECT_EQ(&pyPath.processedJson.get(), &processedJson); + } else { + EXPECT_FALSE(pyPath.processedJson.get().empty()); + } } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestCacheReturnsStableAddressPerRenderer) { - std::string json = createTextRequest("What is OpenVINO?"); - auto apiHandler = parseCurrentRequest(json); - ASSERT_NE(apiHandler, nullptr); + std::string json = createTextRequest("What is OpenVINO?"); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); - auto cppCanonicalFirst = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); - auto cppCanonicalSecond = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); - ASSERT_TRUE(cppCanonicalFirst.ok()); - ASSERT_TRUE(cppCanonicalSecond.ok()); - EXPECT_EQ(cppCanonicalFirst.value(), cppCanonicalSecond.value()); + auto cppCanonicalFirst = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + auto cppCanonicalSecond = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + ASSERT_TRUE(cppCanonicalFirst.ok()); + ASSERT_TRUE(cppCanonicalSecond.ok()); + EXPECT_EQ(cppCanonicalFirst.value(), cppCanonicalSecond.value()); - auto pyCanonicalFirst = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); - auto pyCanonicalSecond = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); - ASSERT_TRUE(pyCanonicalFirst.ok()); - ASSERT_TRUE(pyCanonicalSecond.ok()); - EXPECT_EQ(pyCanonicalFirst.value(), pyCanonicalSecond.value()); + auto pyCanonicalFirst = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); + auto pyCanonicalSecond = apiHandler->getCanonicalRequest(ovms::RendererType::PY_JINJA); + ASSERT_TRUE(pyCanonicalFirst.ok()); + ASSERT_TRUE(pyCanonicalSecond.ok()); + EXPECT_EQ(pyCanonicalFirst.value(), pyCanonicalSecond.value()); } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestCppPathContainsTemplateInputs) { - if (endpoint() != ovms::Endpoint::CHAT_COMPLETIONS) { - GTEST_SKIP() << "Tools/chat_template_kwargs assertions apply to chat/completions flow"; - } + if (endpoint() != ovms::Endpoint::CHAT_COMPLETIONS) { + GTEST_SKIP() << "Tools/chat_template_kwargs assertions apply to chat/completions flow"; + } - std::string json = createTextRequest( - "What is OpenVINO?", - R"(, + std::string json = createTextRequest( + "What is OpenVINO?", + R"(, "tools": [ { "type": "function", @@ -875,21 +875,21 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, CanonicalRequestCppPathCont ], "chat_template_kwargs": {"enable_thinking": true} )"); - auto apiHandler = parseCurrentRequest(json); - ASSERT_NE(apiHandler, nullptr); + auto apiHandler = parseCurrentRequest(json); + ASSERT_NE(apiHandler, nullptr); - auto cppCanonical = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); - ASSERT_TRUE(cppCanonical.ok()); - ASSERT_TRUE(std::holds_alternative(*cppCanonical.value())); + auto cppCanonical = apiHandler->getCanonicalRequest(ovms::RendererType::CPP_TOKENIZER); + ASSERT_TRUE(cppCanonical.ok()); + ASSERT_TRUE(std::holds_alternative(*cppCanonical.value())); - const auto& cppPath = std::get(*cppCanonical.value()); - EXPECT_TRUE(cppPath.addGenerationPrompt); - ASSERT_TRUE(cppPath.tools.has_value()); - ASSERT_TRUE(cppPath.chatTemplateKwargs.has_value()); + const auto& cppPath = std::get(*cppCanonical.value()); + EXPECT_TRUE(cppPath.addGenerationPrompt); + ASSERT_TRUE(cppPath.tools.has_value()); + ASSERT_TRUE(cppPath.chatTemplateKwargs.has_value()); - const auto& kwargs = cppPath.chatTemplateKwargs.value(); - ASSERT_TRUE(kwargs["enable_thinking"].as_bool().has_value()); - EXPECT_TRUE(kwargs["enable_thinking"].as_bool().value()); + const auto& kwargs = cppPath.chatTemplateKwargs.value(); + ASSERT_TRUE(kwargs["enable_thinking"].as_bool().has_value()); + EXPECT_TRUE(kwargs["enable_thinking"].as_bool().value()); } TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquivalentMessages) { @@ -921,7 +921,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonContainsEquiva } #else if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_FALSE(apiHandler->getProcessedJson().empty()) << "Canonical PyPath should provide JSON regardless of Python build mode"; + EXPECT_FALSE(apiHandler->getProcessedJson().empty()) << "Canonical PyPath should provide JSON regardless of Python build mode"; } #endif } @@ -966,7 +966,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonEquivalentMult } #else if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_FALSE(apiHandler->getProcessedJson().empty()) << "Canonical PyPath should provide JSON regardless of Python build mode"; + EXPECT_FALSE(apiHandler->getProcessedJson().empty()) << "Canonical PyPath should provide JSON regardless of Python build mode"; } #endif } @@ -992,7 +992,7 @@ TEST_P(HttpOpenAIHandlerChatAndResponsesParsingTest, ProcessedJsonIncludesToolsW } #else if (endpoint() == ovms::Endpoint::RESPONSES) { - EXPECT_FALSE(apiHandler->getProcessedJson().empty()) << "Canonical PyPath should provide JSON regardless of Python build mode"; + EXPECT_FALSE(apiHandler->getProcessedJson().empty()) << "Canonical PyPath should provide JSON regardless of Python build mode"; } #endif } @@ -5571,7 +5571,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesImageHistoryIndexMatchesChatHistor EXPECT_LT(turnIndex, chatHistory.size()); } - TEST_F(HttpOpenAIHandlerParsingTest, VlmImagePromptUtilsRejectsRestrictedImageTag) { +TEST_F(HttpOpenAIHandlerParsingTest, VlmImagePromptUtilsRejectsRestrictedImageTag) { std::string json = R"({ "model": "llama", "messages": [{"role": "user", "content": "prefix "}] @@ -5587,19 +5587,19 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesImageHistoryIndexMatchesChatHistor auto status = ovms::vlm::rejectRestrictedImageTags(apiHandler->getChatHistory()); EXPECT_EQ(status, absl::InvalidArgumentError("Message contains restricted tag")); - } +} - TEST_F(HttpOpenAIHandlerParsingTest, VlmImagePromptUtilsInjectsImageTagsAndCollectsTensors) { +TEST_F(HttpOpenAIHandlerParsingTest, VlmImagePromptUtilsInjectsImageTagsAndCollectsTensors) { const std::string base64Image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAAEElEQVR4nGLK27oAEAAA//8DYAHGgEvy5AAAAABJRU5ErkJggg=="; - std::string json = R"({ + std::string json = R"({ "model": "llama", "messages": [ {"role":"user","content":[ {"type":"text","text":"what is in these images?"}, {"type":"image_url","image_url":{"url":")" + - base64Image + R"("}}, + base64Image + R"("}}, {"type":"image_url","image_url":{"url":")" + - base64Image + R"("}} + base64Image + R"("}} ]} ] })"; @@ -5625,7 +5625,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, ResponsesImageHistoryIndexMatchesChatHistor std::string content = chatHistory[0]["content"].as_string().value_or(""); EXPECT_THAT(content, ::testing::HasSubstr("\n\n")); EXPECT_THAT(content, ::testing::HasSubstr("what is in these images?")); - } +} // --- Tools normalisation edge cases ---