openvinotoolkit · exzile · Jun 26, 2026
diff --git a/docs/model_cache.md b/docs/model_cache.md
@@ -23,6 +23,8 @@ Alternatively the location of the cache storage can be set using the parameter `
 
 The model server security context must have read-write access to the cache storage path.
 
+`--cache_dir` also applies to LLM text-generation servables that use the continuous batching pipeline (for example, on GPU). When it is set, the compiled-model (blob) cache persists across restarts, so a model that was compiled earlier, including one that was unloaded while idle, is reloaded from the cache instead of being recompiled. An explicit `CACHE_DIR` in a node's `plugin_config` takes precedence over the global `--cache_dir`.
+
 When using Model Server with configuration file, it is possible to serve more than one model. In such case, model cache is applied to all the models, with an exception to:
 - Models with custom loader (for security reasons explained earlier)
 - Models configured to shape `auto` or batch_size `auto`

diff --git a/src/llm/language_model/continuous_batching/servable_initializer.cpp b/src/llm/language_model/continuous_batching/servable_initializer.cpp
@@ -227,6 +227,21 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
         return status;
     }
 
+    // Propagate the global --cache_dir (ServerSettings) into the continuous batching
+    // pipeline plugin config. Unlike the non-CB path (ModelInstance::setCacheOptions),
+    // this initializer constructs the pipeline directly, so the server-level cache_dir
+    // is otherwise never applied. An explicit CACHE_DIR in the node's plugin_config
+    // remains authoritative.
+    const std::string& globalCacheDir = Config::instance().cacheDir();
+    if (!globalCacheDir.empty()) {
+        if (properties->pluginConfig.find(ov::cache_dir.name()) == properties->pluginConfig.end()) {
+            properties->pluginConfig[ov::cache_dir.name()] = globalCacheDir;
+            SPDLOG_DEBUG("Applying global cache_dir to continuous batching pipeline: {}", globalCacheDir);
+        } else {
+            SPDLOG_DEBUG("CACHE_DIR set explicitly in node plugin_config; keeping user value over global cache_dir");
+        }
+    }
+
     if (properties->device == "CPU") {
         status = applyDefaultCpuProperties(properties->pluginConfig);
         if (!status.ok()) {

diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp
@@ -37,6 +37,7 @@
 #endif
 
 #include "../../http_rest_api_handler.hpp"
+#include "../../config.hpp"
 #include "../../http_status_code.hpp"
 #include "../../json_parser.hpp"
 #include "../../llm/apis/openai_completions.hpp"
@@ -4425,6 +4426,125 @@ TEST_F(LLMVLMOptionsHttpTest, LLMVLMNodeOptionsCheckPluginConfig) {
     LLMNodeOptionsCheckPluginConfig(modelsPath);
 }
 
+// Verifies that the global --cache_dir (ServerSettings) is propagated into the
+// continuous batching pipeline plugin config, and that an explicit CACHE_DIR in
+// the node's plugin_config takes precedence over the global value.
+// Regression test for openvinotoolkit/model_server#4230.
+void LLMNodeOptionsCacheDirPropagation(std::string& modelsPath) {
+    // Re-parse the CLI without --cache_dir on every exit path; a failing ASSERT_* returns early and would skip a trailing reset.
+    struct GlobalCacheDirReset {
+        ~GlobalCacheDirReset() {
+            char* reset_argv[] = {(char*)"ovms", (char*)"--model_path", (char*)"/path/to/model", (char*)"--model_name", (char*)"some_name", (char*)"--rest_port", (char*)"8080"};
+            ovms::Config::instance().parse(static_cast<int>(sizeof(reset_argv) / sizeof(reset_argv[0])), reset_argv);
+        }
+    } globalCacheDirReset;
+    // Seed the global cache_dir via the CLI parser (same path used in production); argc is derived from the array.
+    char* n_argv[] = {(char*)"ovms", (char*)"--model_path", (char*)"/path/to/model", (char*)"--model_name", (char*)"some_name", (char*)"--rest_port", (char*)"8080", (char*)"--cache_dir", (char*)"/tmp/ovms_global_cache"};
+    ovms::Config::instance().parse(static_cast<int>(sizeof(n_argv) / sizeof(n_argv[0])), n_argv);
+    ASSERT_EQ(ovms::Config::instance().cacheDir(), "/tmp/ovms_global_cache");
+
+    // Case 1: no CACHE_DIR in node plugin_config -> global value is applied.
+    {
+        std::string testPbtxt = R"(
+            input_stream: "HTTP_REQUEST_PAYLOAD:input"
+            output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+
+            node: {
+            name: "llmNode"
+            calculator: "HttpLLMCalculator"
+            input_stream: "LOOPBACK:loopback"
+            input_stream: "HTTP_REQUEST_PAYLOAD:input"
+            input_side_packet: "LLM_NODE_RESOURCES:llm"
+            output_stream: "LOOPBACK:loopback"
+            output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+            input_stream_info: {
+                tag_index: 'LOOPBACK:0',
+                back_edge: true
+            }
+            node_options: {
+                [type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
+                    models_path: ")" +
+                                modelsPath + R"("
+                }
+            }
+            input_stream_handler {
+                input_stream_handler: "SyncSetInputStreamHandler",
+                options {
+                [mediapipe.SyncSetInputStreamHandlerOptions.ext] {
+                    sync_set {
+                    tag_index: "LOOPBACK:0"
+                    }
+                }
+                }
+            }
+            }
+        )";
+        adjustConfigForTargetPlatform(testPbtxt);
+        ::mediapipe::CalculatorGraphConfig config;
+        ASSERT_TRUE(::google::protobuf::TextFormat::ParseFromString(testPbtxt, &config));
+        std::shared_ptr<GenAiServable> servable;
+        ASSERT_EQ(initializeGenAiServable(servable, config.node(0), ""), StatusCode::OK);
+        auto properties = std::static_pointer_cast<ContinuousBatchingServableProperties>(servable->getProperties());
+        ASSERT_EQ(properties->pluginConfig.count("CACHE_DIR"), 1);
+        ASSERT_EQ(properties->pluginConfig["CACHE_DIR"].as<std::string>(), "/tmp/ovms_global_cache");
+    }
+
+    // Case 2: explicit node CACHE_DIR wins over the global value (substring checks: the test harness may rewrite the path per platform).
+    {
+        std::string testPbtxt = R"(
+            input_stream: "HTTP_REQUEST_PAYLOAD:input"
+            output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+
+            node: {
+            name: "llmNode"
+            calculator: "HttpLLMCalculator"
+            input_stream: "LOOPBACK:loopback"
+            input_stream: "HTTP_REQUEST_PAYLOAD:input"
+            input_side_packet: "LLM_NODE_RESOURCES:llm"
+            output_stream: "LOOPBACK:loopback"
+            output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+            input_stream_info: {
+                tag_index: 'LOOPBACK:0',
+                back_edge: true
+            }
+            node_options: {
+                [type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
+                    models_path: ")" +
+                                modelsPath + R"("
+                    plugin_config: '{"CACHE_DIR": "/tmp/ovms_node_cache"}'
+                }
+            }
+            input_stream_handler {
+                input_stream_handler: "SyncSetInputStreamHandler",
+                options {
+                [mediapipe.SyncSetInputStreamHandlerOptions.ext] {
+                    sync_set {
+                    tag_index: "LOOPBACK:0"
+                    }
+                }
+                }
+            }
+            }
+        )";
+        adjustConfigForTargetPlatform(testPbtxt);
+        ::mediapipe::CalculatorGraphConfig config;
+        ASSERT_TRUE(::google::protobuf::TextFormat::ParseFromString(testPbtxt, &config));
+        std::shared_ptr<GenAiServable> servable;
+        ASSERT_EQ(initializeGenAiServable(servable, config.node(0), ""), StatusCode::OK);
+        auto properties = std::static_pointer_cast<ContinuousBatchingServableProperties>(servable->getProperties());
+        ASSERT_EQ(properties->pluginConfig.count("CACHE_DIR"), 1);
+        std::string nodeCacheDir = properties->pluginConfig["CACHE_DIR"].as<std::string>();
+        ASSERT_NE(nodeCacheDir.find("ovms_node_cache"), std::string::npos) << "Explicit node CACHE_DIR should be used, got: " << nodeCacheDir;
+        ASSERT_EQ(nodeCacheDir.find("ovms_global_cache"), std::string::npos) << "Global cache_dir must not override explicit node CACHE_DIR, got: " << nodeCacheDir;
+    }
+}
+TEST_F(LLMOptionsHttpTest, LLMNodeOptionsCacheDirPropagation) {
+    LLMNodeOptionsCacheDirPropagation(modelsPath);  // shared cache_dir propagation checks, run under the LLMOptionsHttpTest fixture
+}
+TEST_F(LLMVLMOptionsHttpTest, LLMVLMNodeOptionsCacheDirPropagation) {
+    LLMNodeOptionsCacheDirPropagation(modelsPath);  // shared cache_dir propagation checks, run under the LLMVLMOptionsHttpTest fixture
+}
+
 void LLMNodeOptionsCheckNonDefault(std::string& modelsPath) {
     std::string testPbtxt = R"(
         input_stream: "HTTP_REQUEST_PAYLOAD:input"