From c45573c49fba98ff8e8a16fc95b3fb5be9e38445 Mon Sep 17 00:00:00 2001
From: exzile <joeypongallo@gmail.com>
Date: Fri, 26 Jun 2026 14:15:38 -0400
Subject: [PATCH] Propagate global --cache_dir to continuous batching pipeline

The continuous batching servable initializer constructs the GenAI
ContinuousBatchingPipeline directly and never applied the server-level
--cache_dir (ServerSettings.cacheDir). Unlike the non-CB path, which
applies it via ModelInstance::setCacheOptions, the CB path left model
compilation caching disabled unless the user duplicated the value into
the node's plugin_config as CACHE_DIR. As a result, .blob/.cl_cache
artifacts were never persisted and every restart fully recompiled the
model.

Inject the global cache_dir into the pipeline plugin config before
constructing the pipeline. An explicit CACHE_DIR in the node's
plugin_config remains authoritative.

Add a regression test (LLMNodeOptionsCacheDirPropagation, run for both the
LLM and the LLM/VLM options fixtures) covering propagation of the global
value and precedence of an explicit node value.

Fixes #4230

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/model_cache.md                           |   2 +
 .../servable_initializer.cpp                  |  15 +++
 src/test/llm/llmnode_test.cpp                 | 120 ++++++++++++++++++
 3 files changed, 137 insertions(+)

diff --git a/docs/model_cache.md b/docs/model_cache.md
index 46373ae113..2174ca7f26 100644
--- a/docs/model_cache.md
+++ b/docs/model_cache.md
@@ -23,6 +23,8 @@ Alternatively the location of the cache storage can be set using the parameter `
 
 The model server security context must have read-write access to the cache storage path.
 
+`--cache_dir` also applies to LLM text-generation servables that use the continuous batching pipeline (for example on GPU). With it set, the compiled-model/blob cache is persisted across restarts, so a model that has already been compiled (including one that was unloaded while idle) is reloaded from the cache instead of being recompiled. An explicit `CACHE_DIR` in a node's `plugin_config` takes precedence over the global `--cache_dir`.
+
 When using Model Server with configuration file, it is possible to serve more than one model. In such case, model cache is applied to all the models, with an exception to:
 - Models with custom loader (for security reasons explained earlier)
 - Models configured to shape `auto` or batch_size `auto`
diff --git a/src/llm/language_model/continuous_batching/servable_initializer.cpp b/src/llm/language_model/continuous_batching/servable_initializer.cpp
index 1aaff99844..c59c94bf71 100644
--- a/src/llm/language_model/continuous_batching/servable_initializer.cpp
+++ b/src/llm/language_model/continuous_batching/servable_initializer.cpp
@@ -227,6 +227,21 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
         return status;
     }
 
+    // The continuous batching pipeline is constructed directly by this initializer, so
+    // the server-level --cache_dir (ServerSettings) never reaches it through the non-CB
+    // route (ModelInstance::setCacheOptions). Forward it into the pipeline plugin config
+    // here, but only when the node's plugin_config does not already carry CACHE_DIR:
+    // an explicit node value stays authoritative.
+    const std::string& serverCacheDir = Config::instance().cacheDir();
+    if (!serverCacheDir.empty()) {
+        const bool insertedGlobalValue = properties->pluginConfig.try_emplace(ov::cache_dir.name(), serverCacheDir).second;
+        if (insertedGlobalValue) {
+            SPDLOG_DEBUG("Applying global cache_dir to continuous batching pipeline: {}", serverCacheDir);
+        } else {
+            SPDLOG_DEBUG("CACHE_DIR set explicitly in node plugin_config; keeping user value over global cache_dir");
+        }
+    }
+
     if (properties->device == "CPU") {
         status = applyDefaultCpuProperties(properties->pluginConfig);
         if (!status.ok()) {
diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp
index e13cf29919..5f8b764e84 100644
--- a/src/test/llm/llmnode_test.cpp
+++ b/src/test/llm/llmnode_test.cpp
@@ -37,6 +37,7 @@
 #endif
 
 #include "../../http_rest_api_handler.hpp"
+#include "../../config.hpp"
 #include "../../http_status_code.hpp"
 #include "../../json_parser.hpp"
 #include "../../llm/apis/openai_completions.hpp"
@@ -4425,6 +4426,125 @@ TEST_F(LLMVLMOptionsHttpTest, LLMVLMNodeOptionsCheckPluginConfig) {
     LLMNodeOptionsCheckPluginConfig(modelsPath);
 }
 
+// Verifies that the global --cache_dir (ServerSettings) is propagated into the
+// continuous batching pipeline plugin config, and that an explicit CACHE_DIR in
+// the node's plugin_config takes precedence over the global value.
+// Regression test for openvinotoolkit/model_server#4230.
+void LLMNodeOptionsCacheDirPropagation(std::string& modelsPath) {
+    struct GlobalConfigReset {
+        ~GlobalConfigReset() {
+            char* reset_argv[] = {(char*)"ovms", (char*)"--model_path", (char*)"/path/to/model", (char*)"--model_name", (char*)"some_name", (char*)"--rest_port", (char*)"8080"};
+            ovms::Config::instance().parse(7, reset_argv);
+        }
+    } resetGlobalConfigOnExit;  // A failing ASSERT_* returns early; the destructor still clears cache_dir so the singleton does not leak into other tests.
+    // Seed the global cache_dir via the CLI parser (same path used in production).
+    char* n_argv[] = {(char*)"ovms", (char*)"--model_path", (char*)"/path/to/model", (char*)"--model_name", (char*)"some_name", (char*)"--rest_port", (char*)"8080", (char*)"--cache_dir", (char*)"/tmp/ovms_global_cache"};
+    ovms::Config::instance().parse(9, n_argv);
+    ASSERT_EQ(ovms::Config::instance().cacheDir(), "/tmp/ovms_global_cache");
+
+    // Case 1: no CACHE_DIR in node plugin_config -> global value is applied.
+    {
+        std::string testPbtxt = R"(
+            input_stream: "HTTP_REQUEST_PAYLOAD:input"
+            output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+
+            node: {
+            name: "llmNode"
+            calculator: "HttpLLMCalculator"
+            input_stream: "LOOPBACK:loopback"
+            input_stream: "HTTP_REQUEST_PAYLOAD:input"
+            input_side_packet: "LLM_NODE_RESOURCES:llm"
+            output_stream: "LOOPBACK:loopback"
+            output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+            input_stream_info: {
+                tag_index: 'LOOPBACK:0',
+                back_edge: true
+            }
+            node_options: {
+                [type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
+                    models_path: ")" +
+                                modelsPath + R"("
+                }
+            }
+            input_stream_handler {
+                input_stream_handler: "SyncSetInputStreamHandler",
+                options {
+                [mediapipe.SyncSetInputStreamHandlerOptions.ext] {
+                    sync_set {
+                    tag_index: "LOOPBACK:0"
+                    }
+                }
+                }
+            }
+            }
+        )";
+        adjustConfigForTargetPlatform(testPbtxt);
+        ::mediapipe::CalculatorGraphConfig config;
+        ASSERT_TRUE(::google::protobuf::TextFormat::ParseFromString(testPbtxt, &config));
+        std::shared_ptr<GenAiServable> servable;
+        ASSERT_EQ(initializeGenAiServable(servable, config.node(0), ""), StatusCode::OK);
+        auto properties = std::static_pointer_cast<ContinuousBatchingServableProperties>(servable->getProperties());
+        ASSERT_EQ(properties->pluginConfig.count("CACHE_DIR"), 1);
+        ASSERT_EQ(properties->pluginConfig["CACHE_DIR"].as<std::string>(), "/tmp/ovms_global_cache");
+    }
+
+    // Case 2: explicit CACHE_DIR in node plugin_config wins over the global value.
+    {
+        std::string testPbtxt = R"(
+            input_stream: "HTTP_REQUEST_PAYLOAD:input"
+            output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+
+            node: {
+            name: "llmNode"
+            calculator: "HttpLLMCalculator"
+            input_stream: "LOOPBACK:loopback"
+            input_stream: "HTTP_REQUEST_PAYLOAD:input"
+            input_side_packet: "LLM_NODE_RESOURCES:llm"
+            output_stream: "LOOPBACK:loopback"
+            output_stream: "HTTP_RESPONSE_PAYLOAD:output"
+            input_stream_info: {
+                tag_index: 'LOOPBACK:0',
+                back_edge: true
+            }
+            node_options: {
+                [type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
+                    models_path: ")" +
+                                modelsPath + R"("
+                    plugin_config: '{"CACHE_DIR": "/tmp/ovms_node_cache"}'
+                }
+            }
+            input_stream_handler {
+                input_stream_handler: "SyncSetInputStreamHandler",
+                options {
+                [mediapipe.SyncSetInputStreamHandlerOptions.ext] {
+                    sync_set {
+                    tag_index: "LOOPBACK:0"
+                    }
+                }
+                }
+            }
+            }
+        )";
+        adjustConfigForTargetPlatform(testPbtxt);
+        ::mediapipe::CalculatorGraphConfig config;
+        ASSERT_TRUE(::google::protobuf::TextFormat::ParseFromString(testPbtxt, &config));
+        std::shared_ptr<GenAiServable> servable;
+        ASSERT_EQ(initializeGenAiServable(servable, config.node(0), ""), StatusCode::OK);
+        auto properties = std::static_pointer_cast<ContinuousBatchingServableProperties>(servable->getProperties());
+        ASSERT_EQ(properties->pluginConfig.count("CACHE_DIR"), 1);
+        // The harness may rewrite the path for the target platform, so match on substrings: the explicit node value must win.
+        std::string nodeCacheDir = properties->pluginConfig["CACHE_DIR"].as<std::string>();
+        ASSERT_NE(nodeCacheDir.find("ovms_node_cache"), std::string::npos) << "Explicit node CACHE_DIR should be used, got: " << nodeCacheDir;
+        ASSERT_EQ(nodeCacheDir.find("ovms_global_cache"), std::string::npos) << "Global cache_dir must not override explicit node CACHE_DIR, got: " << nodeCacheDir;
+    }
+}
+TEST_F(LLMOptionsHttpTest, LLMNodeOptionsCacheDirPropagation) {  // shared cache_dir checks, run with this fixture's modelsPath
+    LLMNodeOptionsCacheDirPropagation(modelsPath);
+}
+TEST_F(LLMVLMOptionsHttpTest, LLMVLMNodeOptionsCacheDirPropagation) {  // same checks, run with the LLMVLMOptionsHttpTest fixture's modelsPath
+    LLMNodeOptionsCacheDirPropagation(modelsPath);
+}
+
 void LLMNodeOptionsCheckNonDefault(std::string& modelsPath) {
     std::string testPbtxt = R"(
         input_stream: "HTTP_REQUEST_PAYLOAD:input"