feat: add page-aligned tensor creator for host KV cache.

Kang-Meng · Kang-Meng · commit 2a32a610d268 · 2025-11-17T15:34:47.000+08:00
diff --git a/third_party/dependencies.sh b/third_party/dependencies.sh
@@ -59,8 +59,8 @@ if [ -d "yalantinglibs" ]; then
 fi
 
 # Clone yalantinglibs
-echo "Cloning yalantinglibs from https://github.com/alibaba/yalantinglibs.git"
-git clone https://github.com/alibaba/yalantinglibs.git
+echo "Cloning yalantinglibs from https://gitcode.com/gh_mirrors/ya/yalantinglibs.git"
+git clone https://gitcode.com/gh_mirrors/ya/yalantinglibs.git
 check_success "Failed to clone yalantinglibs"
 
 # Build and install yalantinglibs
diff --git a/xllm/core/distributed_runtime/comm_channel.cpp b/xllm/core/distributed_runtime/comm_channel.cpp
@@ -312,8 +312,9 @@ void CommChannel::transfer_kv_blocks(
     const std::vector<BlockTransferInfo>& block_transfer_info,
     folly::Promise<uint32_t>& promise) {
   proto::BlockTransferInfos pb_block_transfer_info;
-  if (!block_transfer_info_to_proto(
-          0x0, block_transfer_info, &pb_block_transfer_info)) {
+  if (!block_transfer_info_to_proto(block_transfer_info,
+                                    &pb_block_transfer_info)) {
+    LOG(ERROR) << "transfer_kv_blocks fail: create proto fail!";
     promise.setValue(0);
     return;
   }
@@ -330,6 +331,8 @@ void CommChannel::transfer_kv_blocks(
   proto::BlockTransferInfos pb_block_transfer_info;
   if (!block_transfer_info_to_proto(
           batch_id, block_transfer_info, &pb_block_transfer_info)) {
+    LOG(ERROR) << "transfer_kv_blocks with batch id " << batch_id
+               << " fail: create proto fail!";
     return;
   }
   brpc::Controller cntl;
@@ -351,11 +354,7 @@ class ClientStreamReceiver : public brpc::StreamInputHandler {
 
   ~ClientStreamReceiver() {
     if (!promise_set_.exchange(true)) {
-      try {
-        close_promise_.set_value();
-      } catch (const std::exception& e) {
-        LOG(WARNING) << "Exception in destructor: " << e.what();
-      }
+      close_promise_.set_value();
     }
   }
 
@@ -400,8 +399,9 @@ void CommChannel::prefetch_from_storage(
     const std::vector<BlockTransferInfo>& block_transfer_info,
     std::shared_ptr<std::atomic<uint32_t>>& success_cnt) {
   proto::BlockTransferInfos pb_block_transfer_info;
-  if (!block_transfer_info_to_proto(
-          0x0, block_transfer_info, &pb_block_transfer_info)) {
+  if (!block_transfer_info_to_proto(block_transfer_info,
+                                    &pb_block_transfer_info)) {
+    LOG(ERROR) << "prefetch_from_storage fail: create proto fail!";
     return;
   }
   ClientStreamReceiver receiver(flag, success_cnt);
@@ -420,6 +420,7 @@ void CommChannel::prefetch_from_storage(
 
   if (cntl.Failed()) {
     LOG(ERROR) << "Fail to connect stream, " << cntl.ErrorText();
+    return;
   }
 
   receiver.get_close_future().wait();
diff --git a/xllm/core/distributed_runtime/worker_service.cpp b/xllm/core/distributed_runtime/worker_service.cpp
@@ -425,7 +425,7 @@ void WorkerService::TransferBlocks(
   std::vector<BlockTransferInfo> block_transfer_info;
   uint64_t batch_id = proto_to_block_transfer_info(*req, block_transfer_info);
 
-  if (batch_id == 0x0) {
+  if (batch_id == UNINITIALIZED_BATCH_ID) {
     resp->set_success_cnt(worker_->transfer_kv_blocks(block_transfer_info));
   } else {
     worker_->transfer_kv_blocks(batch_id, std::move(block_transfer_info));
@@ -441,11 +441,7 @@ class ServerStreamHandler : public brpc::StreamInputHandler {
  public:
   ~ServerStreamHandler() {
     if (!promise_set_.exchange(true)) {
-      try {
-        close_promise_.set_value();
-      } catch (const std::exception& e) {
-        LOG(WARNING) << "Exception in destructor: " << e.what();
-      }
+      close_promise_.set_value();
     }
   }
 
diff --git a/xllm/core/framework/batch/batch.h b/xllm/core/framework/batch/batch.h
@@ -34,6 +34,9 @@ namespace xllm {
 
 struct ModelArgs;
 
+static uint64_t batch_counter_ = 1;
+constexpr uint64_t UNINITIALIZED_BATCH_ID = 0x0;
+
 class Batch {
  public:
   Batch() = default;
@@ -56,8 +59,12 @@ class Batch {
   }
 
   void set_batch_id() {
-    if (batch_id_ == 0x0) {
-      batch_id_ = absl::ToUnixMicros(absl::Now());
+    if (batch_id_ == UNINITIALIZED_BATCH_ID) {
+      batch_id_ = batch_counter_;
+      batch_counter_++;
+      if (batch_counter_ == UINT64_MAX) {
+        batch_counter_ = 1;
+      }
     }
   }
 
@@ -138,7 +145,7 @@ class Batch {
   // all sequences in this batch are in prefill stage
   bool all_seqs_in_prefill_ = false;
 
-  uint64_t batch_id_ = 0x0;
+  uint64_t batch_id_ = UNINITIALIZED_BATCH_ID;
 };
 
 }  // namespace xllm
diff --git a/xllm/core/framework/batch/batch_input_builder.h b/xllm/core/framework/batch/batch_input_builder.h
@@ -159,7 +159,7 @@ class BatchInputBuilder {
 
   // thread pool for multithreaded processing, not owned
   ThreadPool* thread_pool_ = nullptr;
-  uint64_t batch_id_ = 0x0;
+  uint64_t batch_id_;
 };
 
 }  // namespace xllm
diff --git a/xllm/core/framework/block/block_manager_impl.cpp b/xllm/core/framework/block/block_manager_impl.cpp
@@ -70,9 +70,9 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {
     for (const auto& block : blocks) {
       // the block is not shared by other sequence
       if (block.is_valid() && block.ref_count() <= 2) {
-        if (num_used_blocks_ > 0) {
-          num_used_blocks_.fetch_sub(1, std::memory_order_relaxed);
-        } else {
+        auto origin_num_used_blocks =
+            num_used_blocks_.fetch_sub(1, std::memory_order_relaxed);
+        if (origin_num_used_blocks < 0) {
           LOG(ERROR) << "num_used_blocks_==0 cannot fetch_sub for id:"
                      << block.id()
                      << ", total block size: " << num_total_blocks();
@@ -84,7 +84,7 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {
               error_msg.append(std::to_string(id)).append(" ");
             }
           }
-          LOG(ERROR) << error_msg;
+          LOG(FATAL) << error_msg;
         }
       }
     }
diff --git a/xllm/core/framework/block/block_manager_pool.cpp b/xllm/core/framework/block/block_manager_pool.cpp
@@ -156,13 +156,9 @@ void BlockManagerPool::set_offload_callback(
                     device_block_mgr_ptr = block_managers_[i].get()](
                        std::vector<folly::Try<uint32_t>>&& results) {
           for (auto&& result : results) {
-            try {
-              if (result.value() != host_blocks.size()) {
-                LOG(FATAL) << "Offload copy fail, expected "
-                           << host_blocks.size() << ", got " << result.value();
-              }
-            } catch (const std::exception& e) {
-              LOG(FATAL) << "Offload copy fail! Exception caught: " << e.what();
+            if (result.value() != host_blocks.size()) {
+              LOG(FATAL) << "Offload copy fail, expected " << host_blocks.size()
+                         << ", got " << result.value();
             }
           }
           host_block_mgr_ptr->cache(host_blocks);
diff --git a/xllm/core/framework/kv_cache/kv_cache_store.cpp b/xllm/core/framework/kv_cache/kv_cache_store.cpp
@@ -55,30 +55,18 @@ bool KVCacheStore::init(const StoreConfig& config,
   LOG(INFO) << "v_cache_size_per_block: " << v_cache_size_per_block_;
 
   if (config_.protocol == "rdma") {
-    for (int block = 0; block < host_kv_caches_->size(); block++) {
-      void* key_cache = static_cast<char*>(
-          host_kv_caches_->at(block).get_k_cache().data_ptr());
-
-      auto register_k_result = client_ptr_->RegisterLocalMemory(
-          key_cache, k_cache_size_per_block_, "cpu:0", false, false);
-
-      if (!register_k_result.has_value()) {
-        LOG(ERROR) << "Failed to register local memory for key cache: "
-                   << toString(register_k_result.error());
-        return false;
-      }
-
-      void* value_cache = static_cast<char*>(
-          host_kv_caches_->at(block).get_v_cache().data_ptr());
-
-      auto register_v_result = client_ptr_->RegisterLocalMemory(
-          value_cache, v_cache_size_per_block_, "cpu:0", false, false);
-
-      if (!register_v_result.has_value()) {
-        LOG(ERROR) << "Failed to register local memory for value cache: "
-                   << toString(register_v_result.error());
+    if (config_.total_size > 0 && config_.tensor_data != nullptr) {
+      auto result = client_ptr_->RegisterLocalMemory(
+          config_.tensor_data, config_.total_size, "cpu:0", false, false);
+      if (!result.has_value()) {
+        LOG(ERROR) << "Failed to register local memory: "
+                   << toString(result.error());
         return false;
       }
+    } else {
+      LOG(FATAL) << "rdma must RegisterLocalMemory, but got register size: "
+                 << config_.total_size
+                 << ", and data ptr: " << uint64_t(config_.tensor_data);
     }
   }
   is_initialized_ = true;
diff --git a/xllm/core/framework/kv_cache/kv_cache_store.h b/xllm/core/framework/kv_cache/kv_cache_store.h
@@ -19,6 +19,8 @@ struct StoreConfig {
   std::string master_server_address = "";
   int replica_num = 1;
   uint32_t tp_rank = 0;
+  size_t total_size = 0;
+  void* tensor_data = nullptr;
 };
 
 class KVCacheStore {
diff --git a/xllm/core/framework/model/model_input_params.h b/xllm/core/framework/model/model_input_params.h
@@ -27,7 +27,11 @@ limitations under the License.
 
 namespace xllm {
 
-enum class TransferType : uint8_t { G2H = 0, H2D = 1, D2G = 2 };
+enum class TransferType : uint8_t {
+  G2H = 0,  // global memory(KVCache store) to host memory(DRAM)
+  H2D = 1,  // host memory(DRAM) to device memory(HBM)
+  D2G = 2   // host memory(DRAM) to global memory(KVCache store)
+};
 
 struct BlockTransferInfo {
   int32_t src_block_id = -1;
diff --git a/xllm/core/framework/prefix_cache/prefix_cache.cpp b/xllm/core/framework/prefix_cache/prefix_cache.cpp
@@ -124,11 +124,6 @@ size_t PrefixCache::insert(const Slice<int32_t>& token_ids,
   return insert(token_ids, blocks, &insert_keys);
 }
 
-size_t PrefixCache::insert(const std::vector<Block>& blocks) {
-  std::vector<Murmur3Key> insert_keys;
-  return insert(blocks, &insert_keys);
-}
-
 size_t PrefixCache::evict(size_t n_blocks) {
   std::vector<Murmur3Key> evict_keys;
   return evict(n_blocks, &evict_keys);
@@ -197,13 +192,11 @@ size_t PrefixCache::insert(const Slice<int32_t>& token_ids,
   return n_tokens;
 }
 
-size_t PrefixCache::insert(const std::vector<Block>& blocks,
-                           std::vector<Murmur3Key>* insert_keys) {
+size_t PrefixCache::insert(const std::vector<Block>& blocks) {
   const int64_t now = absl::ToUnixMicros(absl::Now());
   DNodeList node_list;
   Murmur3Key token_hash_key;
 
-  insert_keys->reserve(blocks.size());
   for (size_t i = 0; i < blocks.size(); i++) {
     if (!blocks[i].is_valid()) {
       continue;
@@ -227,8 +220,6 @@ size_t PrefixCache::insert(const std::vector<Block>& blocks,
       cached_blocks_.emplace(std::make_pair(token_hash_key, new_node));
 
       num_blocks_++;
-
-      insert_keys->emplace_back(token_hash_key.data);
     }
   }
 
diff --git a/xllm/core/framework/prefix_cache/prefix_cache.h b/xllm/core/framework/prefix_cache/prefix_cache.h
@@ -101,8 +101,6 @@ class PrefixCache {
   size_t insert(const Slice<int32_t>& token_ids,
                 std::vector<Block>& blocks,
                 std::vector<Murmur3Key>* insert_keys);
-  size_t insert(const std::vector<Block>& blocks,
-                std::vector<Murmur3Key>* insert_keys);
   size_t evict(size_t n_blocks, std::vector<Murmur3Key>* evict_keys);
 
   struct Node {
diff --git a/xllm/core/platform/npu/npu_layer_synchronizer.cpp b/xllm/core/platform/npu/npu_layer_synchronizer.cpp
@@ -27,7 +27,7 @@ NPULayerSynchronizerImpl::NPULayerSynchronizerImpl(const int64_t num_layers,
   uint32_t flags = ACL_EVENT_SYNC;
   for (int64_t i = 0; i < num_layers; ++i) {
     auto ret = aclrtCreateEventWithFlag(&events_[i], flags);
-    CHECK(ret == ACL_SUCCESS) << "Create event failed.";
+    CHECK(ret == ACL_SUCCESS) << "Create event failed:" << ret;
   }
 }
 
diff --git a/xllm/core/runtime/params_utils.cpp b/xllm/core/runtime/params_utils.cpp
@@ -729,7 +729,6 @@ uint64_t proto_to_block_transfer_info(
 }
 
 bool block_transfer_info_to_proto(
-    const uint64_t batch_id,
     const std::vector<BlockTransferInfo>& block_transfer_info,
     proto::BlockTransferInfos* pb_block_transfer_info) {
   pb_block_transfer_info->mutable_transfer_infos()->Reserve(
@@ -754,12 +753,24 @@ bool block_transfer_info_to_proto(
     pb_cache.set_dst_block_id(info.dst_block_id);
     pb_cache.set_hash_key(info.hash_key, MURMUR_HASH3_VALUE_LEN);
 
-    *pb_block_transfer_info->mutable_transfer_infos()->Add() = pb_cache;
+    *pb_block_transfer_info->mutable_transfer_infos()->Add() =
+        std::move(pb_cache);
   }
-  pb_block_transfer_info->set_batch_id(batch_id);
   pb_block_transfer_info->set_transfer_type(proto::TransferType(transfer_type));
 
   return true;
 }
 
+bool block_transfer_info_to_proto(
+    const uint64_t batch_id,
+    const std::vector<BlockTransferInfo>& block_transfer_info,
+    proto::BlockTransferInfos* pb_block_transfer_info) {
+  if (!block_transfer_info_to_proto(block_transfer_info,
+                                    pb_block_transfer_info)) {
+    return false;
+  }
+  pb_block_transfer_info->set_batch_id(batch_id);
+  return true;
+}
+
 }  // namespace xllm
diff --git a/xllm/core/runtime/params_utils.h b/xllm/core/runtime/params_utils.h
@@ -56,6 +56,10 @@ uint64_t proto_to_block_transfer_info(
     const proto::BlockTransferInfos& pb_block_transfer_info,
     std::vector<BlockTransferInfo>& block_transfer_info);
 
+bool block_transfer_info_to_proto(
+    const std::vector<BlockTransferInfo>& block_transfer_info,
+    proto::BlockTransferInfos* pb_block_transfer_info);
+
 bool block_transfer_info_to_proto(
     const uint64_t batch_id,
     const std::vector<BlockTransferInfo>& block_transfer_info,
diff --git a/xllm/core/runtime/worker_impl.cpp b/xllm/core/runtime/worker_impl.cpp
diff --git a/xllm/core/runtime/worker_impl.h b/xllm/core/runtime/worker_impl.h
diff --git a/xllm/models/llm/glm4_moe_mtp.h b/xllm/models/llm/glm4_moe_mtp.h
diff --git a/xllm/models/llm/qwen3.h b/xllm/models/llm/qwen3.h

Original file line number	Diff line number	Diff line change
`@@ -70,9 +70,9 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {`
`70`	`70`	`for (const auto& block : blocks) {`
`71`	`71`	`// the block is not shared by other sequence`
`72`	`72`	`if (block.is_valid() && block.ref_count() <= 2) {`
`73`		`- if (num_used_blocks_ > 0) {`
`74`		`- num_used_blocks_.fetch_sub(1, std::memory_order_relaxed);`
`75`		`- } else {`
	`73`	`+ auto origin_num_used_blocks =`
	`74`	`+ num_used_blocks_.fetch_sub(1, std::memory_order_relaxed);`
	`75`	`+ if (origin_num_used_blocks < 0) {`
`76`	`76`	`LOG(ERROR) << "num_used_blocks_==0 cannot fetch_sub for id:"`
`77`	`77`	`<< block.id()`
`78`	`78`	`<< ", total block size: " << num_total_blocks();`
`@@ -84,7 +84,7 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {`
`84`	`84`	`error_msg.append(std::to_string(id)).append(" ");`
`85`	`85`	`}`
`86`	`86`	`}`
`87`		`- LOG(ERROR) << error_msg;`
	`87`	`+ LOG(FATAL) << error_msg;`
`88`	`88`	`}`
`89`	`89`	`}`
`90`	`90`	`}`