feat: add page-aligned tensor creator for host KV cache.

Kang-Meng · Kang-Meng · commit 1c0f9a2f897d · 2025-11-12T16:08:48.000+08:00
diff --git a/third_party/dependencies.sh b/third_party/dependencies.sh
@@ -59,8 +59,8 @@ if [ -d "yalantinglibs" ]; then
 fi
 
 # Clone yalantinglibs
-echo "Cloning yalantinglibs from https://github.com/alibaba/yalantinglibs.git"
-git clone https://github.com/alibaba/yalantinglibs.git
+echo "Cloning yalantinglibs from https://gitcode.com/gh_mirrors/ya/yalantinglibs.git"
+git clone https://gitcode.com/gh_mirrors/ya/yalantinglibs.git
 check_success "Failed to clone yalantinglibs"
 
 # Build and install yalantinglibs
diff --git a/xllm/core/distributed_runtime/comm_channel.cpp b/xllm/core/distributed_runtime/comm_channel.cpp
@@ -351,11 +351,7 @@ class ClientStreamReceiver : public brpc::StreamInputHandler {
 
   ~ClientStreamReceiver() {
     if (!promise_set_.exchange(true)) {
-      try {
-        close_promise_.set_value();
-      } catch (const std::exception& e) {
-        LOG(WARNING) << "Exception in destructor: " << e.what();
-      }
+      close_promise_.set_value();
     }
   }
 
diff --git a/xllm/core/distributed_runtime/worker_service.cpp b/xllm/core/distributed_runtime/worker_service.cpp
@@ -442,11 +442,7 @@ class ServerStreamHandler : public brpc::StreamInputHandler {
  public:
   ~ServerStreamHandler() {
     if (!promise_set_.exchange(true)) {
-      try {
-        close_promise_.set_value();
-      } catch (const std::exception& e) {
-        LOG(WARNING) << "Exception in destructor: " << e.what();
-      }
+      close_promise_.set_value();
     }
   }
 
diff --git a/xllm/core/framework/batch/batch.h b/xllm/core/framework/batch/batch.h
@@ -34,6 +34,7 @@ namespace xllm {
 
 struct ModelArgs;
 
+static uint64_t batch_counter_ = 1;
 class Batch {
  public:
   Batch() = default;
@@ -57,7 +58,11 @@ class Batch {
 
   void set_batch_id() {
     if (batch_id_ == 0x0) {
-      batch_id_ = absl::ToUnixMicros(absl::Now());
+      batch_id_ = batch_counter_;
+      batch_counter_++;
+      if (batch_counter_ == UINT64_MAX) {
+        batch_counter_ = 1;
+      }
     }
   }
 
diff --git a/xllm/core/framework/block/block_manager_impl.cpp b/xllm/core/framework/block/block_manager_impl.cpp
@@ -70,9 +70,9 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {
     for (const auto& block : blocks) {
       // the block is not shared by other sequence
       if (block.is_valid() && block.ref_count() <= 2) {
-        if (num_used_blocks_ > 0) {
-          num_used_blocks_.fetch_sub(1, std::memory_order_relaxed);
-        } else {
+        auto origin_num_used_blocks =
+            num_used_blocks_.fetch_sub(1, std::memory_order_relaxed);
+        if (origin_num_used_blocks < 0) {
           LOG(ERROR) << "num_used_blocks_==0 cannot fetch_sub for id:"
                      << block.id()
                      << ", total block size: " << num_total_blocks();
@@ -84,7 +84,7 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {
               error_msg.append(std::to_string(id)).append(" ");
             }
           }
-          LOG(ERROR) << error_msg;
+          LOG(FATAL) << error_msg;
         }
       }
     }
diff --git a/xllm/core/framework/block/block_manager_pool.cpp b/xllm/core/framework/block/block_manager_pool.cpp
@@ -156,13 +156,9 @@ void BlockManagerPool::set_offload_callback(
                     device_block_mgr_ptr = block_managers_[i].get()](
                        std::vector<folly::Try<uint32_t>>&& results) {
           for (auto&& result : results) {
-            try {
-              if (result.value() != host_blocks.size()) {
-                LOG(FATAL) << "Offload copy fail, expected "
-                           << host_blocks.size() << ", got " << result.value();
-              }
-            } catch (const std::exception& e) {
-              LOG(FATAL) << "Offload copy fail! Exception caught: " << e.what();
+            if (result.value() != host_blocks.size()) {
+              LOG(FATAL) << "Offload copy fail, expected " << host_blocks.size()
+                         << ", got " << result.value();
             }
           }
           host_block_mgr_ptr->cache(host_blocks);
@@ -212,6 +208,8 @@ bool BlockManagerPool::allocate(Sequence* sequence, size_t num_tokens) {
     allocate_shared(sequence);
     if (sequence->host_kv_state().num_kv_blocks() == 0) {
       allocate_host_shared(sequence);
+      if (sequence->kv_state().shared_kv_blocks_num() <
+          sequence->host_kv_state().shared_kv_blocks_num())
     }
   }
 
diff --git a/xllm/core/framework/kv_cache/kv_cache_store.cpp b/xllm/core/framework/kv_cache/kv_cache_store.cpp
@@ -55,30 +55,18 @@ bool KVCacheStore::init(const StoreConfig& config,
   LOG(INFO) << "v_cache_size_per_block: " << v_cache_size_per_block_;
 
   if (config_.protocol == "rdma") {
-    for (int block = 0; block < host_kv_caches_->size(); block++) {
-      void* key_cache = static_cast<char*>(
-          host_kv_caches_->at(block).get_k_cache().data_ptr());
-
-      auto register_k_result = client_ptr_->RegisterLocalMemory(
-          key_cache, k_cache_size_per_block_, "cpu:0", false, false);
-
-      if (!register_k_result.has_value()) {
-        LOG(ERROR) << "Failed to register local memory for key cache: "
-                   << toString(register_k_result.error());
-        return false;
-      }
-
-      void* value_cache = static_cast<char*>(
-          host_kv_caches_->at(block).get_v_cache().data_ptr());
-
-      auto register_v_result = client_ptr_->RegisterLocalMemory(
-          value_cache, v_cache_size_per_block_, "cpu:0", false, false);
-
-      if (!register_v_result.has_value()) {
-        LOG(ERROR) << "Failed to register local memory for value cache: "
-                   << toString(register_v_result.error());
+    if (config_.total_size > 0 && config_.tensor_data != nullptr) {
+      auto result = client_ptr_->RegisterLocalMemory(
+          config_.tensor_data, config_.total_size, "cpu:0", false, false);
+      if (!result.has_value()) {
+        LOG(ERROR) << "Failed to register local memory: "
+                   << toString(result.error());
         return false;
       }
+    } else {
+      LOG(FATAL) << "rdma must RegisterLocalMemory, but got register size: "
+                 << config_.total_size
+                 << ", and data ptr: " << uint64_t(config_.tensor_data);
     }
   }
   is_initialized_ = true;
diff --git a/xllm/core/framework/kv_cache/kv_cache_store.h b/xllm/core/framework/kv_cache/kv_cache_store.h
@@ -19,6 +19,8 @@ struct StoreConfig {
   std::string master_server_address = "";
   int replica_num = 1;
   uint32_t tp_rank = 0;
+  size_t total_size = 0;
+  void* tensor_data = nullptr;
 };
 
 class KVCacheStore {
diff --git a/xllm/core/framework/model/model_input_params.h b/xllm/core/framework/model/model_input_params.h
@@ -27,7 +27,11 @@ limitations under the License.
 
 namespace xllm {
 
-enum class TransferType : uint8_t { G2H = 0, H2D = 1, D2G = 2 };
+enum class TransferType : uint8_t {
+  G2H = 0,  // global memory (KVCache store) to host memory (DRAM)
+  H2D = 1,  // host memory (DRAM) to device memory (HBM)
+  D2G = 2   // host memory (DRAM) to global memory (KVCache store); NOTE(review): "D" maps to DRAM here but to device in H2D - confirm naming
+};
 
 struct BlockTransferInfo {
   int32_t src_block_id = -1;
diff --git a/xllm/core/framework/prefix_cache/prefix_cache.cpp b/xllm/core/framework/prefix_cache/prefix_cache.cpp
@@ -124,11 +124,6 @@ size_t PrefixCache::insert(const Slice<int32_t>& token_ids,
   return insert(token_ids, blocks, &insert_keys);
 }
 
-size_t PrefixCache::insert(const std::vector<Block>& blocks) {
-  std::vector<Murmur3Key> insert_keys;
-  return insert(blocks, &insert_keys);
-}
-
 size_t PrefixCache::evict(size_t n_blocks) {
   std::vector<Murmur3Key> evict_keys;
   return evict(n_blocks, &evict_keys);
@@ -197,13 +192,11 @@ size_t PrefixCache::insert(const Slice<int32_t>& token_ids,
   return n_tokens;
 }
 
-size_t PrefixCache::insert(const std::vector<Block>& blocks,
-                           std::vector<Murmur3Key>* insert_keys) {
+size_t PrefixCache::insert(const std::vector<Block>& blocks) {
   const int64_t now = absl::ToUnixMicros(absl::Now());
   DNodeList node_list;
   Murmur3Key token_hash_key;
 
-  insert_keys->reserve(blocks.size());
   for (size_t i = 0; i < blocks.size(); i++) {
     if (!blocks[i].is_valid()) {
       continue;
@@ -227,8 +220,6 @@ size_t PrefixCache::insert(const std::vector<Block>& blocks,
       cached_blocks_.emplace(std::make_pair(token_hash_key, new_node));
 
       num_blocks_++;
-
-      insert_keys->emplace_back(token_hash_key.data);
     }
   }
 
diff --git a/xllm/core/framework/prefix_cache/prefix_cache.h b/xllm/core/framework/prefix_cache/prefix_cache.h
@@ -101,8 +101,6 @@ class PrefixCache {
   size_t insert(const Slice<int32_t>& token_ids,
                 std::vector<Block>& blocks,
                 std::vector<Murmur3Key>* insert_keys);
-  size_t insert(const std::vector<Block>& blocks,
-                std::vector<Murmur3Key>* insert_keys);
   size_t evict(size_t n_blocks, std::vector<Murmur3Key>* evict_keys);
 
   struct Node {
diff --git a/xllm/core/platform/npu/npu_layer_synchronizer.cpp b/xllm/core/platform/npu/npu_layer_synchronizer.cpp
@@ -27,7 +27,7 @@ NPULayerSynchronizerImpl::NPULayerSynchronizerImpl(const int64_t num_layers,
   uint32_t flags = ACL_EVENT_SYNC;
   for (int64_t i = 0; i < num_layers; ++i) {
     auto ret = aclrtCreateEventWithFlag(&events_[i], flags);
-    CHECK(ret == ACL_SUCCESS) << "Create event failed.";
+    CHECK(ret == ACL_SUCCESS) << "Create event failed:" << ret;
   }
 }
 
diff --git a/xllm/core/runtime/params_utils.cpp b/xllm/core/runtime/params_utils.cpp
@@ -754,7 +754,8 @@ bool block_transfer_info_to_proto(
     pb_cache.set_dst_block_id(info.dst_block_id);
     pb_cache.set_hash_key(info.hash_key, MURMUR_HASH3_VALUE_LEN);
 
-    *pb_block_transfer_info->mutable_transfer_infos()->Add() = pb_cache;
+    *pb_block_transfer_info->mutable_transfer_infos()->Add() =
+        std::move(pb_cache);
   }
   pb_block_transfer_info->set_batch_id(batch_id);
   pb_block_transfer_info->set_transfer_type(proto::TransferType(transfer_type));
diff --git a/xllm/core/runtime/worker_impl.cpp b/xllm/core/runtime/worker_impl.cpp
diff --git a/xllm/core/runtime/worker_impl.h b/xllm/core/runtime/worker_impl.h
diff --git a/xllm/models/llm/glm4_moe_mtp.h b/xllm/models/llm/glm4_moe_mtp.h
diff --git a/xllm/models/llm/qwen3.h b/xllm/models/llm/qwen3.h

Original file line number	Diff line number	Diff line change
`@@ -351,11 +351,7 @@ class ClientStreamReceiver : public brpc::StreamInputHandler {`
`351`	`351`
`352`	`352`	`~ClientStreamReceiver() {`
`353`	`353`	`if (!promise_set_.exchange(true)) {`
`354`		`- try {`
`355`		`- close_promise_.set_value();`
`356`		`- } catch (const std::exception& e) {`
`357`		`- LOG(WARNING) << "Exception in destructor: " << e.what();`
`358`		`- }`
	`354`	`+ close_promise_.set_value();`
`359`	`355`	`}`
`360`	`356`	`}`
`361`	`357`
Original file line number	Diff line number	Diff line change
`@@ -442,11 +442,7 @@ class ServerStreamHandler : public brpc::StreamInputHandler {`
`442`	`442`	`public:`
`443`	`443`	`~ServerStreamHandler() {`
`444`	`444`	`if (!promise_set_.exchange(true)) {`
`445`		`- try {`
`446`		`- close_promise_.set_value();`
`447`		`- } catch (const std::exception& e) {`
`448`		`- LOG(WARNING) << "Exception in destructor: " << e.what();`
`449`		`- }`
	`445`	`+ close_promise_.set_value();`
`450`	`446`	`}`
`451`	`447`	`}`
`452`	`448`
Original file line number	Diff line number	Diff line change
`@@ -70,9 +70,9 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {`
`70`	`70`	`for (const auto& block : blocks) {`
`71`	`71`	`// the block is not shared by other sequence`
`72`	`72`	`if (block.is_valid() && block.ref_count() <= 2) {`
`73`		`- if (num_used_blocks_ > 0) {`
`74`		`- num_used_blocks_.fetch_sub(1, std::memory_order_relaxed);`
`75`		`- } else {`
	`73`	`+ auto origin_num_used_blocks =`
	`74`	`+ num_used_blocks_.fetch_sub(1, std::memory_order_relaxed);`
	`75`	`+ if (origin_num_used_blocks < 0) {`
`76`	`76`	`LOG(ERROR) << "num_used_blocks_==0 cannot fetch_sub for id:"`
`77`	`77`	`<< block.id()`
`78`	`78`	`<< ", total block size: " << num_total_blocks();`
`@@ -84,7 +84,7 @@ void BlockManagerImpl::deallocate(const Slice<Block>& blocks) {`
`84`	`84`	`error_msg.append(std::to_string(id)).append(" ");`
`85`	`85`	`}`
`86`	`86`	`}`
`87`		`- LOG(ERROR) << error_msg;`
	`87`	`+ LOG(FATAL) << error_msg;`
`88`	`88`	`}`
`89`	`89`	`}`
`90`	`90`	`}`
Original file line number	Diff line number	Diff line change
`@@ -156,13 +156,9 @@ void BlockManagerPool::set_offload_callback(`
`156`	`156`	`device_block_mgr_ptr = block_managers_[i].get()](`
`157`	`157`	`std::vector<folly::Try<uint32_t>>&& results) {`
`158`	`158`	`for (auto&& result : results) {`
`159`		`- try {`
`160`		`- if (result.value() != host_blocks.size()) {`
`161`		`- LOG(FATAL) << "Offload copy fail, expected "`
`162`		`- << host_blocks.size() << ", got " << result.value();`
`163`		`- }`
`164`		`- } catch (const std::exception& e) {`
`165`		`- LOG(FATAL) << "Offload copy fail! Exception caught: " << e.what();`
	`159`	`+ if (result.value() != host_blocks.size()) {`
	`160`	`+ LOG(FATAL) << "Offload copy fail, expected " << host_blocks.size()`
	`161`	`+ << ", got " << result.value();`
`166`	`162`	`}`
`167`	`163`	`}`
`168`	`164`	`host_block_mgr_ptr->cache(host_blocks);`
`@@ -212,6 +208,8 @@ bool BlockManagerPool::allocate(Sequence* sequence, size_t num_tokens) {`
`212`	`208`	`allocate_shared(sequence);`
`213`	`209`	`if (sequence->host_kv_state().num_kv_blocks() == 0) {`
`214`	`210`	`allocate_host_shared(sequence);`
	`211`	`+ if (sequence->kv_state().shared_kv_blocks_num() <`
	`212`	`+ sequence->host_kv_state().shared_kv_blocks_num())`
`215`	`213`	`}`
`216`	`214`	`}`
`217`	`215`