From c45573c49fba98ff8e8a16fc95b3fb5be9e38445 Mon Sep 17 00:00:00 2001 From: exzile Date: Fri, 26 Jun 2026 14:15:38 -0400 Subject: [PATCH] Propagate global --cache_dir to continuous batching pipeline The continuous batching servable initializer constructs the GenAI ContinuousBatchingPipeline directly and never applied the server-level --cache_dir (ServerSettings.cacheDir). Unlike the non-CB path, which applies it via ModelInstance::setCacheOptions, the CB path left model compilation caching disabled unless the user duplicated the value into the node's plugin_config as CACHE_DIR. As a result, .blob/.cl_cache artifacts were never persisted and every restart fully recompiled the model. Inject the global cache_dir into the pipeline plugin config before constructing the pipeline. An explicit CACHE_DIR in the node's plugin_config remains authoritative. Adds a regression test (LLMNodeOptionsCacheDirPropagation) covering both propagation of the global value and precedence of an explicit node value. Fixes #4230 Co-Authored-By: Claude Opus 4.8 --- docs/model_cache.md | 2 + .../servable_initializer.cpp | 15 +++ src/test/llm/llmnode_test.cpp | 120 ++++++++++++++++++ 3 files changed, 137 insertions(+) diff --git a/docs/model_cache.md b/docs/model_cache.md index 46373ae113..2174ca7f26 100644 --- a/docs/model_cache.md +++ b/docs/model_cache.md @@ -23,6 +23,8 @@ Alternatively the location of the cache storage can be set using the parameter ` The model server security context must have read-write access to the cache storage path. +`--cache_dir` also applies to LLM text-generation servables using the continuous batching pipeline (GPU). With it set, the compiled-model/blob cache is persisted across restarts, so a model that has already been compiled (or idle-unloaded) reloads from the cache instead of recompiling. An explicit `CACHE_DIR` in a node's `plugin_config` takes precedence over the global `--cache_dir`. + When using Model Server with configuration file, it is possible to serve more than one model. In such case, model cache is applied to all the models, with an exception to: - Models with custom loader (for security reasons explained earlier) - Models configured to shape `auto` or batch_size `auto` diff --git a/src/llm/language_model/continuous_batching/servable_initializer.cpp b/src/llm/language_model/continuous_batching/servable_initializer.cpp index 1aaff99844..c59c94bf71 100644 --- a/src/llm/language_model/continuous_batching/servable_initializer.cpp +++ b/src/llm/language_model/continuous_batching/servable_initializer.cpp @@ -227,6 +227,21 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptrpluginConfig.find(ov::cache_dir.name()) == properties->pluginConfig.end()) { + properties->pluginConfig[ov::cache_dir.name()] = globalCacheDir; + SPDLOG_DEBUG("Applying global cache_dir to continuous batching pipeline: {}", globalCacheDir); + } else { + SPDLOG_DEBUG("CACHE_DIR set explicitly in node plugin_config; keeping user value over global cache_dir"); + } + } + if (properties->device == "CPU") { status = applyDefaultCpuProperties(properties->pluginConfig); if (!status.ok()) { diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp index e13cf29919..5f8b764e84 100644 --- a/src/test/llm/llmnode_test.cpp +++ b/src/test/llm/llmnode_test.cpp @@ -37,6 +37,7 @@ #endif #include "../../http_rest_api_handler.hpp" +#include "../../config.hpp" #include "../../http_status_code.hpp" #include "../../json_parser.hpp" #include "../../llm/apis/openai_completions.hpp" @@ -4425,6 +4426,125 @@ TEST_F(LLMVLMOptionsHttpTest, LLMVLMNodeOptionsCheckPluginConfig) { LLMNodeOptionsCheckPluginConfig(modelsPath); } +// Verifies that the global --cache_dir (ServerSettings) is propagated into the +// continuous batching pipeline plugin config, and that an explicit CACHE_DIR in +// the node's plugin_config takes precedence over the global value. +// Regression test for openvinotoolkit/model_server#4230. +void LLMNodeOptionsCacheDirPropagation(std::string& modelsPath) { + // Seed the global cache_dir via the CLI parser (same path used in production). + char* n_argv[] = {(char*)"ovms", (char*)"--model_path", (char*)"/path/to/model", (char*)"--model_name", (char*)"some_name", (char*)"--rest_port", (char*)"8080", (char*)"--cache_dir", (char*)"/tmp/ovms_global_cache"}; + int arg_count = 9; + ovms::Config::instance().parse(arg_count, n_argv); + ASSERT_EQ(ovms::Config::instance().cacheDir(), "/tmp/ovms_global_cache"); + + // Case 1: no CACHE_DIR in node plugin_config -> global value is applied. + { + std::string testPbtxt = R"( + input_stream: "HTTP_REQUEST_PAYLOAD:input" + output_stream: "HTTP_RESPONSE_PAYLOAD:output" + + node: { + name: "llmNode" + calculator: "HttpLLMCalculator" + input_stream: "LOOPBACK:loopback" + input_stream: "HTTP_REQUEST_PAYLOAD:input" + input_side_packet: "LLM_NODE_RESOURCES:llm" + output_stream: "LOOPBACK:loopback" + output_stream: "HTTP_RESPONSE_PAYLOAD:output" + input_stream_info: { + tag_index: 'LOOPBACK:0', + back_edge: true + } + node_options: { + [type.googleapis.com / mediapipe.LLMCalculatorOptions]: { + models_path: ")" + + modelsPath + R"(" + } + } + input_stream_handler { + input_stream_handler: "SyncSetInputStreamHandler", + options { + [mediapipe.SyncSetInputStreamHandlerOptions.ext] { + sync_set { + tag_index: "LOOPBACK:0" + } + } + } + } + } + )"; + adjustConfigForTargetPlatform(testPbtxt); + ::mediapipe::CalculatorGraphConfig config; + ASSERT_TRUE(::google::protobuf::TextFormat::ParseFromString(testPbtxt, &config)); + std::shared_ptr servable; + ASSERT_EQ(initializeGenAiServable(servable, config.node(0), ""), StatusCode::OK); + auto properties = std::static_pointer_cast(servable->getProperties()); + ASSERT_EQ(properties->pluginConfig.count("CACHE_DIR"), 1); + ASSERT_EQ(properties->pluginConfig["CACHE_DIR"].as(), "/tmp/ovms_global_cache"); + } + + // Case 2: explicit CACHE_DIR in node plugin_config wins over the global value. + { + std::string testPbtxt = R"( + input_stream: "HTTP_REQUEST_PAYLOAD:input" + output_stream: "HTTP_RESPONSE_PAYLOAD:output" + + node: { + name: "llmNode" + calculator: "HttpLLMCalculator" + input_stream: "LOOPBACK:loopback" + input_stream: "HTTP_REQUEST_PAYLOAD:input" + input_side_packet: "LLM_NODE_RESOURCES:llm" + output_stream: "LOOPBACK:loopback" + output_stream: "HTTP_RESPONSE_PAYLOAD:output" + input_stream_info: { + tag_index: 'LOOPBACK:0', + back_edge: true + } + node_options: { + [type.googleapis.com / mediapipe.LLMCalculatorOptions]: { + models_path: ")" + + modelsPath + R"(" + plugin_config: '{"CACHE_DIR": "/tmp/ovms_node_cache"}' + } + } + input_stream_handler { + input_stream_handler: "SyncSetInputStreamHandler", + options { + [mediapipe.SyncSetInputStreamHandlerOptions.ext] { + sync_set { + tag_index: "LOOPBACK:0" + } + } + } + } + } + )"; + adjustConfigForTargetPlatform(testPbtxt); + ::mediapipe::CalculatorGraphConfig config; + ASSERT_TRUE(::google::protobuf::TextFormat::ParseFromString(testPbtxt, &config)); + std::shared_ptr servable; + ASSERT_EQ(initializeGenAiServable(servable, config.node(0), ""), StatusCode::OK); + auto properties = std::static_pointer_cast(servable->getProperties()); + ASSERT_EQ(properties->pluginConfig.count("CACHE_DIR"), 1); + // The test harness may rewrite the path for the target platform, so match + // on substrings: the explicit node value must win over the global one. + std::string nodeCacheDir = properties->pluginConfig["CACHE_DIR"].as(); + ASSERT_NE(nodeCacheDir.find("ovms_node_cache"), std::string::npos) << "Explicit node CACHE_DIR should be used, got: " << nodeCacheDir; + ASSERT_EQ(nodeCacheDir.find("ovms_global_cache"), std::string::npos) << "Global cache_dir must not override explicit node CACHE_DIR, got: " << nodeCacheDir; + } + + // Restore the global cache_dir so the singleton does not leak into other tests. + char* reset_argv[] = {(char*)"ovms", (char*)"--model_path", (char*)"/path/to/model", (char*)"--model_name", (char*)"some_name", (char*)"--rest_port", (char*)"8080"}; + ovms::Config::instance().parse(7, reset_argv); +} +TEST_F(LLMOptionsHttpTest, LLMNodeOptionsCacheDirPropagation) { + LLMNodeOptionsCacheDirPropagation(modelsPath); +} +TEST_F(LLMVLMOptionsHttpTest, LLMVLMNodeOptionsCacheDirPropagation) { + LLMNodeOptionsCacheDirPropagation(modelsPath); +} + void LLMNodeOptionsCheckNonDefault(std::string& modelsPath) { std::string testPbtxt = R"( input_stream: "HTTP_REQUEST_PAYLOAD:input"