[BugFix] [PD Disaggregation] fix v1 scheduler prefill node profile run & ipc transfer protocol (#5132)

liyonghua0910 · web-flow · commit 43097a512ad4 · 2025-11-20T21:39:22.000+08:00
* [fix] fix v1 scheduler profile run for append attention in prefill node

* [fix] skip send_signal if kv signal not inited for gpu and xpu

* [fix] extend fix to flash_attn & mla_attn

* [fix] fix v1 pd run in ipc transfer protocol

* [ci] add test for v1 pd profile run using ipc transfer protocol

* [style] fix code style check

* [style] fix code style again

* [fix] fix profile run

* [update] remove --num-gpu-blocks-override in example script

* [chore] rename forward_meta is_profiling to is_dummy_or_profile_run
diff --git a/custom_ops/gpu_ops/remote_cache_kv_ipc.h b/custom_ops/gpu_ops/remote_cache_kv_ipc.h
@@ -18,88 +18,94 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/ipc.h>
 #include <sys/mman.h>
+#include <sys/msg.h>
 #include <sys/stat.h>
 #include <sys/types.h>
-#include <sys/ipc.h>
-#include <sys/msg.h>
 #include <unistd.h>
 
 #include "driver_types.h"
+#include "msg_utils.h"
 #include "paddle/extension.h"
 #include "paddle/phi/core/allocator.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "msg_utils.h"
 
 struct RemoteCacheKvIpc {
-    struct save_cache_kv_complete_signal_layerwise_meta_data{
-        int32_t layer_id=-1;
-        void * shm_ptr=nullptr;
-        int shm_fd=-1;
-        save_cache_kv_complete_signal_layerwise_meta_data(){}
-        save_cache_kv_complete_signal_layerwise_meta_data(int32_t layer_id_,
-                                                            void* shm_ptr_,
-                                                            int shm_fd_)
-            :layer_id(layer_id_), shm_ptr(shm_ptr_), shm_fd(shm_fd_){
-        }
-    };
+  struct save_cache_kv_complete_signal_layerwise_meta_data {
+    int32_t layer_id = -1;
+    void* shm_ptr = nullptr;
+    int shm_fd = -1;
+    save_cache_kv_complete_signal_layerwise_meta_data() {}
+    save_cache_kv_complete_signal_layerwise_meta_data(int32_t layer_id_,
+                                                      void* shm_ptr_,
+                                                      int shm_fd_)
+        : layer_id(layer_id_), shm_ptr(shm_ptr_), shm_fd(shm_fd_) {}
+  };
 
-    struct save_cache_kv_complete_signal_layerwise_meta_data_per_query{
-        int layer_id_;
-        int num_layers_;
-        bool inited = false;
-        struct msgdatakv msg_sed;
-        int msgid;
+  struct save_cache_kv_complete_signal_layerwise_meta_data_per_query {
+    int layer_id_;
+    int num_layers_;
+    bool inited = false;
+    struct msgdatakv msg_sed;
+    int msgid;
 
-        save_cache_kv_complete_signal_layerwise_meta_data_per_query(){}
+    save_cache_kv_complete_signal_layerwise_meta_data_per_query() {}
 
-        void init(const int *seq_lens_encoder,
-                  const int *seq_lens_decoder,
-                  const int rank,
-                  const int num_layers,
-                  const int real_bsz) {
-            layer_id_ = 0;
-            num_layers_ = num_layers;
-            msg_sed.mtype = 1;
-            int encoder_count = 0;
-            for (int i = 0; i < real_bsz; i++) {
-                if (seq_lens_encoder[i] > 0) {
-                    msg_sed.mtext[3 * encoder_count + 2] = i;
-                    msg_sed.mtext[3 * encoder_count + 3] = seq_lens_decoder[i];
-                    msg_sed.mtext[3 * encoder_count + 4] = seq_lens_encoder[i];
-                    encoder_count++;
-                }
-            }
-            msg_sed.mtext[0] = encoder_count;
-
-            if (!inited) {
-                // just init once
-                const int msg_id = 1024 + rank;
-                key_t key = ftok("/opt/", msg_id);
-                msgid = msgget(key, IPC_CREAT | 0666);
-                inited = true;
-            }
+    void init(const int* seq_lens_encoder,
+              const int* seq_lens_decoder,
+              const int rank,
+              const int num_layers,
+              const int real_bsz) {
+      layer_id_ = 0;
+      num_layers_ = num_layers;
+      msg_sed.mtype = 1;
+      int encoder_count = 0;
+      for (int i = 0; i < real_bsz; i++) {
+        if (seq_lens_encoder[i] > 0) {
+          msg_sed.mtext[3 * encoder_count + 2] = i;
+          msg_sed.mtext[3 * encoder_count + 3] = seq_lens_decoder[i];
+          msg_sed.mtext[3 * encoder_count + 4] = seq_lens_encoder[i];
+          encoder_count++;
         }
+      }
+      msg_sed.mtext[0] = encoder_count;
+
+      if (!inited) {
+        // just init once
+        const int msg_id = 1024 + rank;
+        key_t key = ftok("/opt/", msg_id);
+        msgid = msgget(key, IPC_CREAT | 0666);
+        inited = true;
+      }
+    }
 
-        void CUDART_CB send_signal() {
-            msg_sed.mtext[1] = layer_id_;
-            if ((msgsnd(msgid, &msg_sed, (MAX_BSZ * 3 + 2) * 4, 0)) == -1) {
-                printf("kv signal full msg buffer\n");
-            }
-            layer_id_ = (layer_id_ + 1);
-            assert(layer_id_ <= num_layers_);
+    void CUDART_CB send_signal() {
+      if (inited) {
+        msg_sed.mtext[1] = layer_id_;
+        if ((msgsnd(msgid, &msg_sed, (MAX_BSZ * 3 + 2) * 4, 0)) == -1) {
+          printf("kv signal full msg buffer\n");
         }
-    };
+        layer_id_ = (layer_id_ + 1);
+        assert(layer_id_ <= num_layers_);
+      }
+    }
+  };
 
-    static RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data kv_complete_signal_meta_data;
-    static RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data_per_query kv_complete_signal_meta_data_per_query;
-    static void* kv_complete_signal_identity_ptr;
-    static bool kv_complete_signal_shmem_opened;
+  static RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data
+      kv_complete_signal_meta_data;
+  static RemoteCacheKvIpc::
+      save_cache_kv_complete_signal_layerwise_meta_data_per_query
+          kv_complete_signal_meta_data_per_query;
+  static void* kv_complete_signal_identity_ptr;
+  static bool kv_complete_signal_shmem_opened;
 
-    static RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data open_shm_and_get_complete_signal_meta_data(
-        const int rank_id,
-        const int device_id,
-        const bool keep_pd_step_flag);
-    static void CUDART_CB save_cache_kv_complete_signal_layerwise(void* meta_data);
-    static void CUDART_CB save_cache_kv_complete_signal_layerwise_per_query(void* meta_data);
+  static RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data
+  open_shm_and_get_complete_signal_meta_data(const int rank_id,
+                                             const int device_id,
+                                             const bool keep_pd_step_flag);
+  static void CUDART_CB
+  save_cache_kv_complete_signal_layerwise(void* meta_data);
+  static void CUDART_CB
+  save_cache_kv_complete_signal_layerwise_per_query(void* meta_data);
 };
diff --git a/custom_ops/xpu_ops/src/ops/remote_cache_kv_ipc.h b/custom_ops/xpu_ops/src/ops/remote_cache_kv_ipc.h
@@ -72,12 +72,14 @@ struct RemoteCacheKvIpc {
     }
 
     void send_signal() {
-      msg_sed.mtext[1] = layer_id_;
-      if ((msgsnd(msgid, &msg_sed, (MAX_BSZ * 3 + 2) * 4, 0)) == -1) {
-        printf("kv signal full msg buffer\n");
+      if (inited) {
+        msg_sed.mtext[1] = layer_id_;
+        if ((msgsnd(msgid, &msg_sed, (MAX_BSZ * 3 + 2) * 4, 0)) == -1) {
+          printf("kv signal full msg buffer\n");
+        }
+        layer_id_ = (layer_id_ + 1);
+        assert(layer_id_ <= num_layers_);
       }
-      layer_id_ = (layer_id_ + 1);
-      assert(layer_id_ <= num_layers_);
     }
   };
 
diff --git a/examples/splitwise/start_v1_tp1.sh b/examples/splitwise/start_v1_tp1.sh
@@ -68,7 +68,6 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \
        --cache-transfer-protocol "rdma" \
        --rdma-comm-ports "$((P_PORT + 4))" \
        --pd-comm-port "$((P_PORT + 5))" \
-       --num-gpu-blocks-override 2000 \
        --router "0.0.0.0:${ROUTER_PORT}" \
        2>&1 >${FD_LOG_DIR}/nohup &
 
diff --git a/fastdeploy/cache_manager/cache_messager.py b/fastdeploy/cache_manager/cache_messager.py
@@ -687,8 +687,8 @@ def prefill_layerwise_send_cache_thread(self):
                             for engine_idx, _ in batch_engine_signals:
                                 task = self.idx_cache_task_dict[engine_idx]
                                 if task["status"] == "finished" or ("error" in task["status"]):
-                                    target_id = int(task["rdma_ports"][self.rank])
                                     if task["transfer_protocol"] == "ipc":
+                                        target_id = int(task["device_ids"][self.rank])
                                         self.messager["ipc"].write_block_by_sync(target_id)
                                     self.engine_worker_queue.finish_send_cache_barrier.wait()
                                     self.engine_worker_queue.put_finished_req([[task["request_id"], task["status"]]])
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
@@ -517,18 +517,6 @@ def __post_init__(self):
                         f"The number of rdma comm ports must be equal to number of ranks ({self.data_parallel_size=} * {self.tensor_parallel_size=} = {self.data_parallel_size * self.tensor_parallel_size}), but got {len(self.rdma_comm_ports)}."
                     )
 
-            if envs.ENABLE_V1_KVCACHE_SCHEDULER == 1:
-                if "ipc" in self.cache_transfer_protocol:
-                    # FIXME: support ipc cache transfer protocol
-                    raise NotImplementedError(
-                        "only support rdma cache transfer protocol " "when using ENABLE_V1_KVCACHE_SCHEDULER."
-                    )
-                # FIXME: fix this bug
-                if self.splitwise_role == "prefill" and self.num_gpu_blocks_override is None:
-                    raise NotImplementedError(
-                        "please set num_gpu_blocks_override for prefill " "instance using ENABLE_V1_KVCACHE_SCHEDULER."
-                    )
-
         if not current_platform.is_cuda() and not current_platform.is_xpu():
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
         if self.guided_decoding_backend != "off":
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -1001,7 +1001,7 @@ def preallocate_resource_in_d(self, request: Request):
                 request.need_prefill_tokens + self.config.cache_config.block_size - 1
             ) // self.config.cache_config.block_size + self.config.cache_config.enc_dec_block_num  # consider for mtp, plus enc_dec_block_num
             if self.cache_manager.can_allocate_gpu_blocks(need_prealloc_prefill_blocks):
-                request.block_tables.extend(self.cache_manager.allocate_gpu_blocks(need_prealloc_prefill_blocks))
+                request.block_tables = self.cache_manager.allocate_gpu_blocks(need_prealloc_prefill_blocks)
                 request.num_computed_tokens = request.need_prefill_tokens
                 request.disaggregate_info["block_tables"] = request.block_tables
                 allocated_position = self.get_available_position()
diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py
@@ -140,6 +140,8 @@ class ForwardMeta:
     block_tables: Optional[paddle.Tensor] = None
     # KV caches
     caches: Optional[list[paddle.Tensor]] = None
+    # Flag of profile run
+    is_dummy_or_profile_run: bool = False
 
     def clear_caches(self):
         """Safely clean up the caches"""
diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
@@ -178,7 +178,7 @@ def init_attention_metadata(self, forward_meta: ForwardMeta):
         # pd_disaggregation
         metadata.kv_signal_data_list = [None] * self.num_layers
         if self.pd_disaggregation_mode == "per_chunk":
-            if not self.keep_pd_step_flag:
+            if not self.keep_pd_step_flag and not forward_meta.is_dummy_or_profile_run:
                 init_kv_signal_per_query(
                     forward_meta.seq_lens_encoder,
                     forward_meta.seq_lens_this_time,
diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
@@ -231,7 +231,7 @@ def init_attention_metadata(self, forward_meta: ForwardMeta):
         # pd_disaggregation
         metadata.kv_signal_data_list = [None] * self.num_layers
         if self.pd_disaggregation_mode == "per_chunk":
-            if not self.keep_pd_step_flag:
+            if not self.keep_pd_step_flag and not forward_meta.is_dummy_or_profile_run:
                 init_kv_signal_per_query(
                     forward_meta.seq_lens_encoder,
                     forward_meta.seq_lens_this_time,
diff --git a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
@@ -214,7 +214,7 @@ def init_attention_metadata(self, forward_meta: ForwardMeta):
         # pd_disaggregation
         metadata.kv_signal_data_list = [None] * self.num_layers
         if self.pd_disaggregation_mode == "per_chunk":
-            if not self.keep_pd_step_flag:
+            if not self.keep_pd_step_flag and not forward_meta.is_dummy_or_profile_run:
                 init_kv_signal_per_query(
                     forward_meta.seq_lens_encoder,
                     forward_meta.seq_lens_this_time,
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
@@ -1229,7 +1229,7 @@ def _init_share_inputs(self, max_num_seqs: int):
 
         self.share_inputs["mask_rollback"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
 
-    def _prepare_inputs(self) -> None:
+    def _prepare_inputs(self, is_dummy_or_profile_run=False) -> None:
         """Prepare the model inputs"""
         if envs.ENABLE_V1_KVCACHE_SCHEDULER:
             recover_decode_task(
@@ -1280,7 +1280,7 @@ def _prepare_inputs(self) -> None:
         max_bad_tokens_len = np.max(self.share_inputs["bad_tokens_len"].numpy())
 
         # Initialize forward meta data
-        self.initialize_forward_meta()
+        self.initialize_forward_meta(is_dummy_or_profile_run=is_dummy_or_profile_run)
 
         # Get sampling metadata
         self.sampling_metadata = SamplingMetadata(
@@ -1334,7 +1334,7 @@ def get_model(self) -> nn.Layer:
         """Get current model"""
         return self.model
 
-    def initialize_forward_meta(self):
+    def initialize_forward_meta(self, is_dummy_or_profile_run=False):
         """
         Initialize forward meta and attention meta data
         """
@@ -1386,6 +1386,9 @@ def initialize_forward_meta(self):
             only_prefill_use_cudagraph if self.cudagraph_only_prefill else only_decode_use_cudagraph
         )
 
+        # Set forward_meta.is_dummy_or_profile_run to True to skip init_kv_signal_per_query for attention backends
+        self.forward_meta.is_dummy_or_profile_run = is_dummy_or_profile_run
+
         # Initialzie attention meta data
         for attn_backend in self.attn_backends:
             attn_backend.init_attention_metadata(self.forward_meta)
@@ -1778,7 +1781,7 @@ def _dummy_run(
 
         while True:
             # 1. Initialize forward meta and attention meta data
-            self._prepare_inputs()
+            self._prepare_inputs(is_dummy_or_profile_run=True)
 
             # 2. Padding inputs for cuda graph
             self.forward_meta.step_use_cudagraph = in_capturing and self.forward_meta.step_use_cudagraph
diff --git a/tests/e2e/test_ernie_03b_pd_router_v1.py b/tests/e2e/test_ernie_03b_pd_router_v1.py

Original file line number	Diff line number	Diff line change
`@@ -72,12 +72,14 @@ struct RemoteCacheKvIpc {`
`72`	`72`	`}`
`73`	`73`
`74`	`74`	`void send_signal() {`
`75`		`- msg_sed.mtext[1] = layer_id_;`
`76`		`- if ((msgsnd(msgid, &msg_sed, (MAX_BSZ * 3 + 2) * 4, 0)) == -1) {`
`77`		`- printf("kv signal full msg buffer\n");`
	`75`	`+ if (inited) {`
	`76`	`+ msg_sed.mtext[1] = layer_id_;`
	`77`	`+ if ((msgsnd(msgid, &msg_sed, (MAX_BSZ * 3 + 2) * 4, 0)) == -1) {`
	`78`	`+ printf("kv signal full msg buffer\n");`
	`79`	`+ }`
	`80`	`+ layer_id_ = (layer_id_ + 1);`
	`81`	`+ assert(layer_id_ <= num_layers_);`
`78`	`82`	`}`
`79`		`- layer_id_ = (layer_id_ + 1);`
`80`		`- assert(layer_id_ <= num_layers_);`
`81`	`83`	`}`
`82`	`84`	`};`
`83`	`85`