From de095432e00afd951386e68bad4352e1c74e6150 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Wed, 29 Apr 2026 14:55:57 +0200
Subject: [PATCH 01/10] Support finish reason in legacy pipelines

---
 src/llm/apis/openai_completions.cpp           | 13 ++++++++-----
 src/llm/apis/openai_responses.cpp             | 18 ++++++++++++++++--
 src/llm/language_model/legacy/servable.cpp    |  6 +++++-
 .../visual_language_model/legacy/servable.cpp |  6 +++++-
 src/test/llm/llmnode_test.cpp                 |  2 +-
 5 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 810fcdc50a..8a9590d11f 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -409,17 +409,18 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco
 
     // choices: array of size N, where N is related to n request parameter
     jsonResponse.StartArray("choices");
-    int index = 0;
-    for (int i = 0; i < results.tokens.size(); i++) {
+    for (size_t i = 0; i < results.tokens.size(); ++i) {
         const std::vector<int64_t>& tokens = results.tokens[i];
         SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", tokens);
         ParsedOutput parsedOutput = parseOutputIfNeeded(tokens);
         jsonResponse.StartObject();
         // finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
-        auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty());
+        const ov::genai::GenerationFinishReason finishReasonRaw =
+            (!results.finish_reasons.empty()) ? results.finish_reasons[0] : ov::genai::GenerationFinishReason::STOP;
+        auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
         jsonResponse.FinishReason(finishReason.value_or("unknown"));
         // index: integer; Choice index, only n=1 supported anyway
-        jsonResponse.Index(index++);
+        jsonResponse.Index(static_cast<int>(i));
 
         if (endpoint == Endpoint::CHAT_COMPLETIONS) {
             jsonResponse.MessageObject(parsedOutput);
@@ -481,7 +482,9 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
         ParsedOutput parsedOutput = parseOutputIfNeeded(generatedTokens);
         jsonResponse.StartObject();
         // finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
-        auto finishReason = mapFinishReason(ov::genai::GenerationFinishReason::STOP, !parsedOutput.toolCalls.empty());
+        const ov::genai::GenerationFinishReason finishReasonRaw =
+            (!results.finish_reasons.empty()) ? results.finish_reasons[0] : ov::genai::GenerationFinishReason::STOP;
+        auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
         jsonResponse.FinishReason(finishReason.value_or("unknown"));
         // index: integer; Choice index, only n=1 supported anyway
         jsonResponse.Index(index++);
diff --git a/src/llm/apis/openai_responses.cpp b/src/llm/apis/openai_responses.cpp
index 60ec1c4f08..76708be7e1 100644
--- a/src/llm/apis/openai_responses.cpp
+++ b/src/llm/apis/openai_responses.cpp
@@ -653,10 +653,17 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::EncodedRes
     usage.promptTokens = results.perf_metrics.get_num_input_tokens();
     usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
     std::vector<ParsedOutput> parsedOutputs;
+    ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
     for (const auto& tokens : results.tokens) {
         parsedOutputs.push_back(parseOutputIfNeeded(tokens));
     }
-    return serializeUnaryResponseImpl(parsedOutputs);
+    for (const auto& finishReason : results.finish_reasons) {
+        if (finishReason == ov::genai::GenerationFinishReason::LENGTH) {
+            responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
+            break;
+        }
+    }
+    return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);
 }
 
 std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecodedResults& results, const std::string& textResponse) {
@@ -677,7 +684,14 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecoded
             parsedOutputs.push_back(std::move(output));
         }
     }
-    return serializeUnaryResponseImpl(parsedOutputs);
+    ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
+    for (const auto& finishReason : results.finish_reasons) {
+        if (finishReason == ov::genai::GenerationFinishReason::LENGTH) {
+            responsesFinishReason = ov::genai::GenerationFinishReason::LENGTH;
+            break;
+        }
+    }
+    return serializeUnaryResponseImpl(parsedOutputs, responsesFinishReason);
 }
 
 // --- Streaming event building blocks ---
diff --git a/src/llm/language_model/legacy/servable.cpp b/src/llm/language_model/legacy/servable.cpp
index 4234088a2a..3ae0055530 100644
--- a/src/llm/language_model/legacy/servable.cpp
+++ b/src/llm/language_model/legacy/servable.cpp
@@ -229,7 +229,11 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptr
     if (!executionContext->lastStreamerCallbackOutput.empty()) {
         lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
     }
-    std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
+    ov::genai::GenerationFinishReason finishReason = ov::genai::GenerationFinishReason::STOP;
+    if (!legacyExecutionContext->results.finish_reasons.empty()) {
+        finishReason = legacyExecutionContext->results.finish_reasons[0];
+    }
+    std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
     if (!serializedChunk.empty()) {
         executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
     }
diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp
index 1bb2367001..a5384a750e 100644
--- a/src/llm/visual_language_model/legacy/servable.cpp
+++ b/src/llm/visual_language_model/legacy/servable.cpp
@@ -245,7 +245,11 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
     if (!executionContext->lastStreamerCallbackOutput.empty()) {
         lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
     }
-    std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, ov::genai::GenerationFinishReason::STOP);
+    ov::genai::GenerationFinishReason finishReason = ov::genai::GenerationFinishReason::STOP;
+    if (!legacyExecutionContext->results.finish_reasons.empty()) {
+        finishReason = legacyExecutionContext->results.finish_reasons[0];
+    }
+    std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
     if (!serializedChunk.empty()) {
         executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
     }
diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp
index 99d9e8743c..fb47f53751 100644
--- a/src/test/llm/llmnode_test.cpp
+++ b/src/test/llm/llmnode_test.cpp
@@ -2685,7 +2685,7 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Values(
         // params: model name, generate expected output, check logprobs, check finish reason, test speculative decoding, supports empty handshake msg
         TestParameters{"lm_cb_regular", true, true, true, false, true},
-        TestParameters{"lm_legacy_regular", false, false, false, false, false},
+        TestParameters{"lm_legacy_regular", false, false, true, false, false},
         TestParameters{"vlm_cb_regular", false, true, true, false, true},
         TestParameters{"vlm_legacy_regular", false, false, false, false, false}));
 

From 25a10fe11bfa6b66c0e5c87f4edc34a23f8d87bb Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Mon, 11 May 2026 11:42:27 +0200
Subject: [PATCH 02/10] fix

---
 src/llm/apis/openai_completions.cpp               | 14 ++++++++++----
 src/llm/apis/openai_responses.cpp                 |  6 ++++++
 src/llm/language_model/legacy/servable.cpp        |  7 ++++---
 src/llm/visual_language_model/legacy/servable.cpp |  7 ++++---
 4 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 8a9590d11f..bc5b297f73 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -415,8 +415,11 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco
         ParsedOutput parsedOutput = parseOutputIfNeeded(tokens);
         jsonResponse.StartObject();
         // finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
-        const ov::genai::GenerationFinishReason finishReasonRaw =
-            (!results.finish_reasons.empty()) ? results.finish_reasons[0] : ov::genai::GenerationFinishReason::STOP;
+        if (results.finish_reasons.empty()) {
+            throw std::runtime_error("Missing finish reason in unary LM generation result");
+        }
+        // Current generation flow uses batch=1, so only finish_reasons[0] is expected here.
+        const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons[0];
         auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
         jsonResponse.FinishReason(finishReason.value_or("unknown"));
         // index: integer; Choice index, only n=1 supported anyway
@@ -482,8 +485,11 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
         ParsedOutput parsedOutput = parseOutputIfNeeded(generatedTokens);
         jsonResponse.StartObject();
         // finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
-        const ov::genai::GenerationFinishReason finishReasonRaw =
-            (!results.finish_reasons.empty()) ? results.finish_reasons[0] : ov::genai::GenerationFinishReason::STOP;
+        if (results.finish_reasons.empty()) {
+            throw std::runtime_error("Missing finish reason in unary VLM generation result");
+        }
+        // Current generation flow uses batch=1, so only finish_reasons[0] is expected here.
+        const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons[0];
         auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
         jsonResponse.FinishReason(finishReason.value_or("unknown"));
         // index: integer; Choice index, only n=1 supported anyway
diff --git a/src/llm/apis/openai_responses.cpp b/src/llm/apis/openai_responses.cpp
index 76708be7e1..e3ac155a03 100644
--- a/src/llm/apis/openai_responses.cpp
+++ b/src/llm/apis/openai_responses.cpp
@@ -652,6 +652,9 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::EncodedRes
     OVMS_PROFILE_FUNCTION();
     usage.promptTokens = results.perf_metrics.get_num_input_tokens();
     usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
+    if (results.finish_reasons.empty()) {
+        throw std::runtime_error("Missing finish reason in unary LM responses generation result");
+    }
     std::vector<ParsedOutput> parsedOutputs;
     ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
     for (const auto& tokens : results.tokens) {
@@ -670,6 +673,9 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecoded
     OVMS_PROFILE_FUNCTION();
     usage.promptTokens = results.perf_metrics.get_num_input_tokens();
     usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
+    if (results.finish_reasons.empty()) {
+        throw std::runtime_error("Missing finish reason in unary VLM responses generation result");
+    }
     // Usage is already correctly set from perf_metrics above — no need for updateUsage.
     std::vector<ParsedOutput> parsedOutputs;
     if (!textResponse.empty()) {
diff --git a/src/llm/language_model/legacy/servable.cpp b/src/llm/language_model/legacy/servable.cpp
index 3ae0055530..a3ac669565 100644
--- a/src/llm/language_model/legacy/servable.cpp
+++ b/src/llm/language_model/legacy/servable.cpp
@@ -229,10 +229,11 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptr
     if (!executionContext->lastStreamerCallbackOutput.empty()) {
         lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
     }
-    ov::genai::GenerationFinishReason finishReason = ov::genai::GenerationFinishReason::STOP;
-    if (!legacyExecutionContext->results.finish_reasons.empty()) {
-        finishReason = legacyExecutionContext->results.finish_reasons[0];
+    if (legacyExecutionContext->results.finish_reasons.empty()) {
+        return absl::InternalError("Missing finish reason in legacy LM streaming generation result");
     }
+    // Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
+    ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons[0];
     std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
     if (!serializedChunk.empty()) {
         executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp
index a5384a750e..798b6af741 100644
--- a/src/llm/visual_language_model/legacy/servable.cpp
+++ b/src/llm/visual_language_model/legacy/servable.cpp
@@ -245,10 +245,11 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
     if (!executionContext->lastStreamerCallbackOutput.empty()) {
         lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
     }
-    ov::genai::GenerationFinishReason finishReason = ov::genai::GenerationFinishReason::STOP;
-    if (!legacyExecutionContext->results.finish_reasons.empty()) {
-        finishReason = legacyExecutionContext->results.finish_reasons[0];
+    if (legacyExecutionContext->results.finish_reasons.empty()) {
+        return absl::InternalError("Missing finish reason in legacy VLM streaming generation result");
     }
+    // Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
+    ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons[0];
     std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
     if (!serializedChunk.empty()) {
         executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
     }

From 84a262274060dc26e283bf9ebdc921f72145aeff Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Mon, 11 May 2026 12:29:43 +0200
Subject: [PATCH 03/10] fix

---
 src/llm/apis/openai_completions.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index bc5b297f73..4d6b5d3eaf 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -414,7 +414,6 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco
         SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", tokens);
         ParsedOutput parsedOutput = parseOutputIfNeeded(tokens);
         jsonResponse.StartObject();
-        // finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
         if (results.finish_reasons.empty()) {
             throw std::runtime_error("Missing finish reason in unary LM generation result");
         }
@@ -484,7 +483,6 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
         SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", generatedTokens);
         ParsedOutput parsedOutput = parseOutputIfNeeded(generatedTokens);
         jsonResponse.StartObject();
-        // finish_reason: "stop" in regular scenario, "tool_calls" if output contains tool calls
         if (results.finish_reasons.empty()) {
             throw std::runtime_error("Missing finish reason in unary VLM generation result");
         }

From 95abd07b8a29aedf5dc6ad9cc01c13ef4d63cbc2 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Wed, 13 May 2026 12:00:40 +0200
Subject: [PATCH 04/10] fix

---
 src/test/http_openai_handler_test.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index c3a40cba3c..98aeb231b5 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -1018,6 +1018,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseEncodedResultsReturns
 
     ov::genai::EncodedResults results;
     results.tokens = {createHermes3ToolCallTokens(*tokenizer)};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized;
@@ -1049,6 +1050,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseVLMSupportsToolCallsF
     ov::genai::VLMDecodedResults results;
     std::string toolCall = R"({"name": "example_tool", "arguments": {"arg1": "value1", "arg2": 42}})";
     results.texts = {toolCall};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results, toolCall);
     ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized;
@@ -1076,6 +1078,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsO
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -1107,6 +1110,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsR
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -1145,6 +1149,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesOmitsReas
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -2741,6 +2746,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeResponsesUnaryResponseContainsFunc
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -2786,6 +2792,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeResponsesUnaryResponseContainsFunc
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"tool_choice\":{"), std::string::npos) << serialized;
@@ -3095,6 +3102,7 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeUnaryResponseVLMDecodedResultsWith
 
     std::string vlmText = "I will call a tool.{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}";
     results.texts.push_back(vlmText);
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results, vlmText);
 

From f97fd662744ca73b7df89f5077bf4a05cbfde82c Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Thu, 14 May 2026 13:01:32 +0200
Subject: [PATCH 05/10] fix

---
 src/llm/apis/openai_completions.cpp | 8 ++++----
 src/llm/apis/openai_responses.cpp   | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 4d6b5d3eaf..498f9fe543 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -415,10 +415,10 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco
         ParsedOutput parsedOutput = parseOutputIfNeeded(tokens);
         jsonResponse.StartObject();
         if (results.finish_reasons.empty()) {
-            throw std::runtime_error("Missing finish reason in unary LM generation result");
+            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary LM generation result, defaulting to STOP");
         }
         // Current generation flow uses batch=1, so only finish_reasons[0] is expected here.
-        const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons[0];
+        const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : results.finish_reasons[0];
         auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
         jsonResponse.FinishReason(finishReason.value_or("unknown"));
         // index: integer; Choice index, only n=1 supported anyway
@@ -484,10 +484,10 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::VLMD
         ParsedOutput parsedOutput = parseOutputIfNeeded(generatedTokens);
         jsonResponse.StartObject();
         if (results.finish_reasons.empty()) {
-            throw std::runtime_error("Missing finish reason in unary VLM generation result");
+            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary VLM generation result, defaulting to STOP");
         }
         // Current generation flow uses batch=1, so only finish_reasons[0] is expected here.
-        const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons[0];
+        const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : results.finish_reasons[0];
         auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
         jsonResponse.FinishReason(finishReason.value_or("unknown"));
         // index: integer; Choice index, only n=1 supported anyway
diff --git a/src/llm/apis/openai_responses.cpp b/src/llm/apis/openai_responses.cpp
index e3ac155a03..89b897dc4a 100644
--- a/src/llm/apis/openai_responses.cpp
+++ b/src/llm/apis/openai_responses.cpp
@@ -653,7 +653,7 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::EncodedRes
     usage.promptTokens = results.perf_metrics.get_num_input_tokens();
     usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
     if (results.finish_reasons.empty()) {
-        throw std::runtime_error("Missing finish reason in unary LM responses generation result");
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary LM responses generation result, defaulting to STOP");
     }
     std::vector<ParsedOutput> parsedOutputs;
     ov::genai::GenerationFinishReason responsesFinishReason = ov::genai::GenerationFinishReason::STOP;
@@ -674,7 +674,7 @@ std::string OpenAIResponsesHandler::serializeUnaryResponse(ov::genai::VLMDecoded
     usage.promptTokens = results.perf_metrics.get_num_input_tokens();
     usage.completionTokens = results.perf_metrics.get_num_generated_tokens();
     if (results.finish_reasons.empty()) {
-        throw std::runtime_error("Missing finish reason in unary VLM responses generation result");
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary VLM responses generation result, defaulting to STOP");
     }
     // Usage is already correctly set from perf_metrics above — no need for updateUsage.
     std::vector<ParsedOutput> parsedOutputs;

From 7940fffe59cf8d20285f7e71e8b267dc3bf158ed Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Thu, 14 May 2026 14:18:56 +0200
Subject: [PATCH 06/10] fix

---
 src/test/http_openai_handler_test.cpp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index 98aeb231b5..c3a40cba3c 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -1018,7 +1018,6 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseEncodedResultsReturns
 
     ov::genai::EncodedResults results;
     results.tokens = {createHermes3ToolCallTokens(*tokenizer)};
-    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized;
@@ -1049,7 +1049,6 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseVLMSupportsToolCallsF
     ov::genai::VLMDecodedResults results;
     std::string toolCall = R"({"name": "example_tool", "arguments": {"arg1": "value1", "arg2": 42}})";
     results.texts = {toolCall};
-    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results, toolCall);
     ASSERT_NE(serialized.find("\"finish_reason\":\"tool_calls\""), std::string::npos) << serialized;
@@ -1076,7 +1076,6 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsO
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
-    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -1110,7 +1107,6 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesContainsR
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
-    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -1149,7 +1145,6 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesOmitsReas
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
-    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -2746,7 +2741,6 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeResponsesUnaryResponseContainsFunc
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
-    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"object\":\"response\""), std::string::npos) << serialized;
@@ -2792,7 +2786,6 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeResponsesUnaryResponseContainsFunc
     ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
     results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
-    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
     ASSERT_NE(serialized.find("\"tool_choice\":{"), std::string::npos) << serialized;
@@ -3102,7 +3095,6 @@ TEST_F(HttpOpenAIHandlerParsingTest, SerializeUnaryResponseVLMDecodedResultsWith
 
     std::string vlmText = "I will call a tool.{\"name\":\"get_weather\",\"arguments\":{\"location\":\"Paris\"}}";
     results.texts.push_back(vlmText);
-    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results, vlmText);
 

From 439f99ff5eaa95e71dea982019ebce6a34cab352 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Thu, 14 May 2026 16:20:04 +0200
Subject: [PATCH 07/10] fix

---
 src/llm/language_model/legacy/servable.cpp        | 4 ++--
 src/llm/visual_language_model/legacy/servable.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llm/language_model/legacy/servable.cpp b/src/llm/language_model/legacy/servable.cpp
index a3ac669565..8e244df219 100644
--- a/src/llm/language_model/legacy/servable.cpp
+++ b/src/llm/language_model/legacy/servable.cpp
@@ -230,10 +230,10 @@ absl::Status LegacyServable::preparePartialResponse(std::shared_ptr
         lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
     }
     if (legacyExecutionContext->results.finish_reasons.empty()) {
-        return absl::InternalError("Missing finish reason in legacy LM streaming generation result");
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in legacy LM streaming generation result, defaulting to STOP");
     }
     // Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
-    ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons[0];
+    ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0];
     std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
     if (!serializedChunk.empty()) {
         executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);
diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp
index 798b6af741..a40dee296e 100644
--- a/src/llm/visual_language_model/legacy/servable.cpp
+++ b/src/llm/visual_language_model/legacy/servable.cpp
@@ -246,10 +246,10 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
         lastTextChunk = lastTextChunk + executionContext->lastStreamerCallbackOutput;
     }
     if (legacyExecutionContext->results.finish_reasons.empty()) {
-        return absl::InternalError("Missing finish reason in legacy VLM streaming generation result");
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in legacy VLM streaming generation result, defaulting to STOP");
     }
     // Legacy generation path always runs with batch=1, so we read the single finish reason at index 0.
-    ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons[0];
+    ov::genai::GenerationFinishReason finishReason = legacyExecutionContext->results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : legacyExecutionContext->results.finish_reasons[0];
     std::string serializedChunk = executionContext->apiHandler->serializeStreamingChunk(lastTextChunk, finishReason);
     if (!serializedChunk.empty()) {
         executionContext->response = wrapTextInServerSideEventMessage(serializedChunk);

From 17f419d8b3bb1351632b419b4ba6f44361f405bb Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Thu, 14 May 2026 16:32:20 +0200
Subject: [PATCH 08/10] fix

---
 src/llm/apis/openai_completions.cpp | 12 +++++++-----
 src/test/llm/llmnode_test.cpp       |  4 ++--
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 498f9fe543..20ccfe372f 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -409,16 +409,18 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(ov::genai::Enco
 
     // choices: array of size N, where N is related to n request parameter
     jsonResponse.StartArray("choices");
+    if (results.finish_reasons.empty()) {
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary LM generation result, defaulting to STOP for all choices");
+    } else if (results.finish_reasons.size() != results.tokens.size()) {
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Finish reasons size ({}) does not match tokens size ({}) in unary LM generation result, defaulting missing entries to STOP",
+            results.finish_reasons.size(), results.tokens.size());
+    }
     for (size_t i = 0; i < results.tokens.size(); ++i) {
         const std::vector<int64_t>& tokens = results.tokens[i];
         SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", tokens);
         ParsedOutput parsedOutput = parseOutputIfNeeded(tokens);
         jsonResponse.StartObject();
-        if (results.finish_reasons.empty()) {
-            SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Missing finish reason in unary LM generation result, defaulting to STOP");
-        }
-        // Current generation flow uses batch=1, so only finish_reasons[0] is expected here.
-        const ov::genai::GenerationFinishReason finishReasonRaw = results.finish_reasons.empty() ? ov::genai::GenerationFinishReason::STOP : results.finish_reasons[0];
+        const ov::genai::GenerationFinishReason finishReasonRaw = i < results.finish_reasons.size() ? results.finish_reasons[i] : ov::genai::GenerationFinishReason::STOP;
         auto finishReason = mapFinishReason(finishReasonRaw, !parsedOutput.toolCalls.empty());
         jsonResponse.FinishReason(finishReason.value_or("unknown"));
         // index: integer; Choice index, only n=1 supported anyway
diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp
index fb47f53751..0e52c1bd1f 100644
--- a/src/test/llm/llmnode_test.cpp
+++ b/src/test/llm/llmnode_test.cpp
@@ -2687,7 +2687,7 @@ INSTANTIATE_TEST_SUITE_P(
         TestParameters{"lm_cb_regular", true, true, true, false, true},
         TestParameters{"lm_legacy_regular", false, false, true, false, false},
         TestParameters{"vlm_cb_regular", false, true, true, false, true},
-        TestParameters{"vlm_legacy_regular", false, false, false, false, false}));
+        TestParameters{"vlm_legacy_regular", false, false, true, false, false}));
 
 const std::string validRequestBodyWithParameter(const std::string& modelName, const std::string& parameter, const std::string& value) {
     std::string requestBody = R"(
@@ -3611,7 +3611,7 @@ INSTANTIATE_TEST_SUITE_P(
         TestParameters{"lm_cb_regular", true, true, true, false, true},
         TestParameters{"lm_legacy_regular", false, false, false, false, false},
         TestParameters{"vlm_cb_regular", false, true, true, false, true},
-        TestParameters{"vlm_legacy_regular", false, false, false, false, false}));
+        TestParameters{"vlm_legacy_regular", false, false, true, false, false}));
 
 // Common tests for all pipeline types (testing logic executed prior pipeline type selection)
 class LLMConfigHttpTest : public ::testing::Test {};

From f4f19a20073b13e5151af32f8072c2a30a65e5e5 Mon Sep 17 00:00:00 2001
From: Michal Kulakowski
Date: Thu, 14 May 2026 17:36:47 +0200
Subject: [PATCH 09/10] uts

---
 src/test/http_openai_handler_test.cpp | 163 ++++++++++++++++++++++++++
 1 file changed, 163 insertions(+)

diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index c3a40cba3c..f1c9c1e775 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -1713,6 +1713,169 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesCompleted
     ASSERT_NE(serialized.find("\"metadata\":{}"), std::string::npos) << serialized;
 }
 
+TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesEncodedResultsIncompleteOnLength) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "max_output_tokens": 5
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::EncodedResults results;
+    ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
+    int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
+    results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};
+
+    std::string serialized = apiHandler->serializeUnaryResponse(results);
+
+    ASSERT_NE(serialized.find("\"status\":\"incomplete\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"incomplete_details\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"reason\":\"max_tokens\""), std::string::npos) << serialized;
+    ASSERT_EQ(serialized.find("\"completed_at\""), std::string::npos) << serialized;
+    ASSERT_EQ(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesEncodedResultsCompletedOnStop) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "max_output_tokens": 5
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::EncodedResults results;
+    ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
+    int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
+    results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
+
+    std::string serialized = apiHandler->serializeUnaryResponse(results);
+
+    ASSERT_NE(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"completed_at\""), std::string::npos) << serialized;
+    ASSERT_EQ(serialized.find("\"incomplete_details\""), std::string::npos) << serialized;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesVLMDecodedResultsIncompleteOnLength) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "max_output_tokens": 5
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::VLMDecodedResults results;
+    std::string text = "OVMS";
+    results.texts = {text};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};
+
+    std::string serialized = apiHandler->serializeUnaryResponse(results, text);
+
+    ASSERT_NE(serialized.find("\"status\":\"incomplete\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"incomplete_details\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"reason\":\"max_tokens\""), std::string::npos) << serialized;
+    ASSERT_EQ(serialized.find("\"completed_at\""), std::string::npos) << serialized;
+    ASSERT_EQ(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesVLMDecodedResultsCompletedOnStop) {
+    std::string json = R"({
+        "model": "llama",
+        "input": "What is OpenVINO?",
+        "max_output_tokens": 5
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared(doc, ovms::Endpoint::RESPONSES, std::chrono::system_clock::now(), *tokenizer);
+    std::optional<uint32_t> maxTokensLimit;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::VLMDecodedResults results;
+    std::string text = "OVMS";
+    results.texts = {text};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::STOP};
+
+    std::string serialized = apiHandler->serializeUnaryResponse(results, text);
+
+    ASSERT_NE(serialized.find("\"status\":\"completed\""), std::string::npos) << serialized;
+    ASSERT_NE(serialized.find("\"completed_at\""), std::string::npos) << serialized;
+    ASSERT_EQ(serialized.find("\"incomplete_details\""), std::string::npos) << serialized;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseChatCompletionsEncodedResultsLengthFinishReason) {
+    std::string json = R"({
+        "model": "llama",
+        "stream": false,
+        "messages": [{"role": "user", "content": "What is OpenVINO?"}]
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
+    uint32_t maxTokensLimit = 100;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::EncodedResults results;
+    ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
+    int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
+    results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};
+
+    std::string serialized = apiHandler->serializeUnaryResponse(results);
+    ASSERT_NE(serialized.find("\"finish_reason\":\"length\""), std::string::npos) << serialized;
+}
+
+TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseChatCompletionsVLMDecodedResultsLengthFinishReason) {
+    std::string json = R"({
+        "model": "llama",
+        "stream": false,
+        "messages": [{"role": "user", "content": "What is OpenVINO?"}]
+    })";
+    doc.Parse(json.c_str());
+    ASSERT_FALSE(doc.HasParseError());
+
+    auto apiHandler = std::make_shared(doc, ovms::Endpoint::CHAT_COMPLETIONS, std::chrono::system_clock::now(), *tokenizer);
+    uint32_t maxTokensLimit = 100;
+    uint32_t bestOfLimit = 0;
+    std::optional<uint32_t> maxModelLength;
+    ASSERT_EQ(apiHandler->parseRequest(maxTokensLimit, bestOfLimit, maxModelLength), absl::OkStatus());
+
+    ov::genai::VLMDecodedResults results;
+    std::string text = "OVMS";
+    results.texts = {text};
+    results.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};
+
+    std::string serialized = apiHandler->serializeUnaryResponse(results, text);
+    ASSERT_NE(serialized.find("\"finish_reason\":\"length\""), std::string::npos) << serialized;
+}
+
 TEST_F(HttpOpenAIHandlerParsingTest, ParsingMessagesSucceedsBase64) {
     std::string json = R"({
         "model": "llama",

From 2d9d4815b791c823db6d23cc632fb41c818e12ee Mon Sep 17 00:00:00 2001
From: michalkulakowski
Date: Thu, 14 May 2026 17:48:09 +0200
Subject: [PATCH 10/10] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/test/http_openai_handler_test.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/test/http_openai_handler_test.cpp b/src/test/http_openai_handler_test.cpp
index f1c9c1e775..3505379522 100644
--- a/src/test/http_openai_handler_test.cpp
+++ b/src/test/http_openai_handler_test.cpp
@@ -1730,8 +1730,12 @@ TEST_F(HttpOpenAIHandlerParsingTest, serializeUnaryResponseForResponsesEncodedRe
 
     ov::genai::EncodedResults results;
     ov::Tensor outputIds = tokenizer->encode("OVMS", ov::genai::add_special_tokens(false)).input_ids;
+    const auto& shape = outputIds.get_shape();
+    ASSERT_EQ(shape.size(), 2);
+    ASSERT_EQ(shape[0], 1);
+    ASSERT_EQ(outputIds.get_element_type(), ov::element::i64);
     int64_t* outputIdsData = reinterpret_cast<int64_t*>(outputIds.data());
-    results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + outputIds.get_shape()[1])};
+    results.tokens = {std::vector<int64_t>(outputIdsData, outputIdsData + shape[1])};
    results.finish_reasons = {ov::genai::GenerationFinishReason::LENGTH};
 
     std::string serialized = apiHandler->serializeUnaryResponse(results);
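
Note on the behavior this series converges on: after patches 05-08, a missing or undersized finish_reasons vector no longer aborts serialization; each choice falls back to GenerationFinishReason::STOP and a debug message is logged instead. The sketch below restates that per-choice fallback as a standalone helper. It is illustrative only: the helper name finishReasonOrStop is hypothetical and not part of these patches; it assumes only a std::vector of ov::genai::GenerationFinishReason and the existing llm_calculator_logger.

// Hypothetical helper mirroring the fallback logic introduced in patch 08:
// prefer the pipeline-provided finish_reasons[i] when it exists, otherwise
// default to STOP and leave a debug trace instead of failing the response.
static ov::genai::GenerationFinishReason finishReasonOrStop(
    const std::vector<ov::genai::GenerationFinishReason>& finishReasons, size_t i) {
    if (i < finishReasons.size()) {
        // Normal path: the generation pipeline reported a reason for this choice.
        return finishReasons[i];
    }
    // Fallback path: legacy pipelines may not populate finish_reasons at all.
    SPDLOG_LOGGER_DEBUG(llm_calculator_logger,
        "Missing finish reason for choice {}, defaulting to STOP", i);
    return ov::genai::GenerationFinishReason::STOP;
}

With such a helper, a call site in the serializer would reduce to mapFinishReason(finishReasonOrStop(results.finish_reasons, i), !parsedOutput.toolCalls.empty()), keeping the per-endpoint serializers free of repeated emptiness and size checks.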