Merge branch 'main' into feature/rec_251126_pr_v2

DragonFive · web-flow · commit 507e7e9a0b19 · 2025-11-28T09:48:14.000+08:00
diff --git a/xllm/api_service/api_service.cpp b/xllm/api_service/api_service.cpp
@@ -139,7 +139,8 @@ void CommonCompletionsImpl(std::unique_ptr<Service>& service,
     return;
   }
 
-  auto call = std::make_shared<Call>(ctrl, guard.release(), req_pb, resp_pb);
+  auto call = std::make_shared<Call>(
+      ctrl, guard.release(), req_pb, resp_pb, arena != nullptr);
   service->process_async(call);
 }
 }  // namespace
diff --git a/xllm/core/common/global_flags.cpp b/xllm/core/common/global_flags.cpp
@@ -135,7 +135,7 @@ DEFINE_double(max_memory_utilization,
 
 DEFINE_int32(max_tokens_per_batch, 20480, "Max number of tokens per batch.");
 
-DEFINE_int32(max_seqs_per_batch, 256, "Max number of sequences per batch.");
+DEFINE_int32(max_seqs_per_batch, 1024, "Max number of sequences per batch.");
 
 DEFINE_bool(enable_schedule_overlap,
             true,
@@ -172,7 +172,7 @@ DEFINE_int32(ep_size, 1, "Expert parallel size for MoE model.");
 
 DEFINE_string(
     communication_backend,
-    "lccl",
+    "hccl",
     "NPU communication backend.(e.g. lccl, hccl). When enable dp, use hccl.");
 
 // --- ep load balance config ---
diff --git a/xllm/core/distributed_runtime/worker_service.cpp b/xllm/core/distributed_runtime/worker_service.cpp
@@ -136,24 +136,15 @@ void WorkerService::step(ForwardInput& fwd_input,
       }
     }
   } else {
+    auto int_options = torch::TensorOptions().device(torch::kCPU);
     if (worker_->is_driver()) {
       // construct fake output tensor
-      auto options =
-          torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);
-      auto total_prefill_seq_len = 0;
-      auto total_num_sequences = 0;
-
-      total_num_sequences += fwd_input.input_params.num_sequences;
-      total_prefill_seq_len += fwd_input.input_params.prefill_seq_len;
-
-      next_tokens =
-          torch::arange(-1,
-                        -1 * (total_num_sequences - total_prefill_seq_len + 1),
-                        -1,
-                        options);
+      int32_t num_decode_seqs = fwd_input.sampling_params.sample_idxes.size(0);
+      next_tokens = torch::arange(
+          -1, -1 * (num_decode_seqs + 1), -1, int_options.dtype(torch::kInt32));
       std::move(future).deferValue([](auto&&) {});
     }
-    expert_load_data = torch::zeros({1, 1}).to(torch::kInt64).contiguous();
+    expert_load_data = torch::zeros({1, 1}, int_options.dtype(torch::kInt64));
   }
 }
 
diff --git a/xllm/core/framework/batch/batch.cpp b/xllm/core/framework/batch/batch.cpp
@@ -196,9 +196,7 @@ std::map<uint32_t, uint32_t> Batch::cal_seq_exchange_index(
   return index_shift;
 }
 
-RawForwardInput Batch::prepare_forward_input(uint32_t start_idx,
-                                             uint32_t end_idx,
-                                             const ModelArgs& args,
+RawForwardInput Batch::prepare_forward_input(const ModelArgs& args,
                                              ThreadPool* thread_pool) {
   dp_balance_shuffle_seqs();
   BatchInputBuilder builder(sequences_,
@@ -210,7 +208,7 @@ RawForwardInput Batch::prepare_forward_input(uint32_t start_idx,
                             &args,
                             batch_forward_type_,
                             thread_pool);
-  return builder.build_raw_forward_input(start_idx, end_idx);
+  return builder.build_raw_forward_input();
 }
 
 void Batch::process_sample_output(const RawForwardOutput& raw_output,
@@ -341,7 +339,7 @@ void Batch::append_token_for_sequence(Sequence* seq,
         seq->pre_scheduled_step_prefill_queue().pop();
       }
     }
-  } else {
+  } else if (!seq->cancelled()) {
     // truely update the real token if replace_fake_token
     seq->update_last_step_token(token, token_idx);
     if (FLAGS_enable_chunked_prefill && token_idx == 0) {
diff --git a/xllm/core/framework/batch/batch.h b/xllm/core/framework/batch/batch.h
@@ -85,9 +85,7 @@ class Batch {
                                      const ModelArgs& args);
 
   // Convert Batch to pb type, which will be pass to remote worker.
-  RawForwardInput prepare_forward_input(uint32_t start_idx,
-                                        uint32_t end_idx,
-                                        const ModelArgs& args,
+  RawForwardInput prepare_forward_input(const ModelArgs& args,
                                         ThreadPool* thread_pool);
 
   // process output
diff --git a/xllm/core/framework/batch/batch_input_builder.cpp b/xllm/core/framework/batch/batch_input_builder.cpp
@@ -53,7 +53,7 @@ BatchInputBuilder::BatchInputBuilder(
       mm_data_vec_(mm_data_vec),
       args_(args),
       thread_pool_(thread_pool),
-      num_sequences_(static_cast<int32_t>(sequences.size())),
+      num_sequences_(sequences.size()),
       swap_block_transfer_infos_(swap_block_transfer_infos),
       batch_id_(batch_id) {
   // Reserve space for better performance
@@ -72,35 +72,31 @@ BatchInputBuilder::BatchInputBuilder(
 ForwardInput BatchInputBuilder::build_forward_input(
     uint32_t num_decoding_tokens,
     uint32_t min_decoding_batch_size) {
-  process_sequences(0, static_cast<uint32_t>(num_sequences_));
+  process_sequences();
   padding_decode_batch_size(num_decoding_tokens, min_decoding_batch_size);
 
   return state_to_forward_input();
 }
 
-RawForwardInput BatchInputBuilder::build_raw_forward_input(uint32_t start_idx,
-                                                           uint32_t end_idx) {
-  if (!thread_pool_ ||
-      end_idx - start_idx < static_cast<uint32_t>(thread_pool_->size())) {
-    process_sequences(start_idx, end_idx);
+RawForwardInput BatchInputBuilder::build_raw_forward_input() {
+  if (!thread_pool_ || num_sequences_ < thread_pool_->size()) {
+    process_sequences();
   } else {
-    process_sequences_multithreaded(start_idx, end_idx);
+    process_sequences_multithreaded();
   }
   return state_to_raw_forward_input();
 }
 
-void BatchInputBuilder::process_sequences(uint32_t start_idx,
-                                          uint32_t end_idx) {
-  for (int32_t i = start_idx; i < end_idx; ++i) {
+void BatchInputBuilder::process_sequences() {
+  for (int32_t i = 0; i < num_sequences_; ++i) {
     process_single_sequence(i);
   }
 }
 
-void BatchInputBuilder::process_sequences_multithreaded(uint32_t start_idx,
-                                                        uint32_t end_idx) {
+void BatchInputBuilder::process_sequences_multithreaded() {
   const size_t threads_num = thread_pool_->size();
   const size_t sequences_per_thread =
-      (end_idx - start_idx + threads_num - 1) / threads_num;
+      (num_sequences_ + threads_num - 1) / threads_num;
 
   BlockingCounter counter(threads_num);
 
@@ -117,17 +113,17 @@ void BatchInputBuilder::process_sequences_multithreaded(uint32_t start_idx,
           BuilderState& state,
           std::unordered_set<int32_t>& write_block_ids) {
         for (size_t i = thread_start_idx;
-             i < thread_end_idx && i < static_cast<size_t>(end_idx);
+             i < thread_end_idx && i < static_cast<size_t>(num_sequences_);
              ++i) {
           process_single_sequence(i, &state, &write_block_ids);
         }
       };
 
   // Start parallel tasks
   for (size_t thread_idx = 0; thread_idx < threads_num; ++thread_idx) {
-    size_t thread_start_idx = start_idx + thread_idx * sequences_per_thread;
+    size_t thread_start_idx = thread_idx * sequences_per_thread;
     size_t thread_end_idx = std::min(thread_start_idx + sequences_per_thread,
-                                     static_cast<size_t>(end_idx));
+                                     static_cast<size_t>(num_sequences_));
 
     thread_pool_->schedule([process_sequences_range,
                             thread_start_idx,
@@ -214,7 +210,6 @@ void BatchInputBuilder::process_sequences_multithreaded(uint32_t start_idx,
     state_.new_token_slot_ids.insert(state_.new_token_slot_ids.end(),
                                      state.new_token_slot_ids.begin(),
                                      state.new_token_slot_ids.end());
-    state_.prefill_seq_len += state.prefill_seq_len;
     state_.embedding_ids.insert(state_.embedding_ids.end(),
                                 state.embedding_ids.begin(),
                                 state.embedding_ids.end());
@@ -306,11 +301,6 @@ void BatchInputBuilder::process_single_sequence(
         sequence, n_kv_cache_tokens, seq_len, q_seq_len, state_ptr);
   }
 
-  // Track prefill sequences
-  if (sequence->is_chunked_prefill_stage()) {
-    state.prefill_seq_len++;
-  }
-
   // Input for beam search kernel
   if (FLAGS_enable_beam_search_kernel && sequence->check_beam_search() &&
       sequence->num_generated_tokens() > 0) {
@@ -658,7 +648,6 @@ RawForwardInput BatchInputBuilder::state_to_raw_forward_input() {
   raw_forward_input.num_sequences = num_sequences_;
   // raw_forward_input.dp_global_token_nums = ;
   raw_forward_input.transfer_kv_infos = std::move(state_.transfer_kv_infos);
-  raw_forward_input.prefill_seq_len = state_.prefill_seq_len;
 
   // for flashinfer
   raw_forward_input.paged_kv_indptr = std::move(state_.paged_kv_indptr);
diff --git a/xllm/core/framework/batch/batch_input_builder.h b/xllm/core/framework/batch/batch_input_builder.h
@@ -47,12 +47,12 @@ class BatchInputBuilder {
   ForwardInput build_forward_input(uint32_t num_decoding_tokens,
                                    uint32_t min_decoding_batch_size);
 
-  RawForwardInput build_raw_forward_input(uint32_t start_idx, uint32_t end_idx);
+  RawForwardInput build_raw_forward_input();
 
  private:
   // Core building methods
-  void process_sequences(uint32_t start_idx, uint32_t end_idx);
-  void process_sequences_multithreaded(uint32_t start_idx, uint32_t end_idx);
+  void process_sequences();
+  void process_sequences_multithreaded();
   void padding_decode_batch_size(uint32_t num_decoding_tokens,
                                  uint32_t min_decoding_batch_size);
   ForwardInput state_to_forward_input();
@@ -100,7 +100,6 @@ class BatchInputBuilder {
     // Additional data
     std::vector<int32_t> embedding_ids;
     std::vector<int32_t> extra_token_ids;
-    uint32_t prefill_seq_len = 0;
     std::vector<TransferKVInfo> transfer_kv_infos;
 
     // for continuous kvcache
@@ -153,7 +152,7 @@ class BatchInputBuilder {
 
   // Configuration
   bool use_mrope_ = false;
-  int32_t num_sequences_ = 0;
+  uint32_t num_sequences_ = 0;
 
   // copy in and out cache contents
   std::unordered_set<int32_t> write_block_ids_;
diff --git a/xllm/core/framework/model/model_input_params.h b/xllm/core/framework/model/model_input_params.h
@@ -110,7 +110,6 @@ struct ModelInputParams {
 
     params.mm_data = MMData::to(mm_data, device);
     params.dp_global_token_nums = dp_global_token_nums;
-    params.prefill_seq_len = prefill_seq_len;
     params.embedding_ids = std::move(embedding_ids);
     params.extra_token_ids = std::move(extra_token_ids);
     params.dp_ep_padding_data = dp_ep_padding_data;
@@ -151,8 +150,7 @@ struct ModelInputParams {
               << " , global_empty_kv_cache is " << global_empty_kv_cache
               << " , num_sequences is " << num_sequences
               << " , kv_max_seq_len is " << kv_max_seq_len
-              << " , q_max_seq_len is " << q_max_seq_len
-              << " , prefill_seq_len is " << prefill_seq_len;
+              << " , q_max_seq_len is " << q_max_seq_len;
     LOG(INFO) << "ModelInputParams: kv_seq_lens_vec is " << kv_seq_lens_vec;
     LOG(INFO) << "ModelInputParams: q_seq_lens_vec is " << q_seq_lens_vec;
     LOG(INFO) << "ModelInputParams: decode_seq_range is " << decode_seq_range;
@@ -209,9 +207,6 @@ struct ModelInputParams {
   // whether the kv-cache is empty for all sequences,mainly used for dp case
   bool global_empty_kv_cache = true;
 
-  // num of prefill sequence in chunked prefill case
-  uint32_t prefill_seq_len = 0;
-
   // embedding ids of each sequence
   std::vector<int32_t> embedding_ids;
 
diff --git a/xllm/core/framework/request/request.cpp b/xllm/core/framework/request/request.cpp
@@ -125,6 +125,13 @@ size_t Request::total_num_blocks() {
   return num;
 }
 
+void Request::set_cancel() {
+  cancelled_.store(true, std::memory_order_relaxed);
+  for (const auto& seq : sequences()) {
+    seq->set_cancel();
+  }
+}
+
 RequestOutput Request::generate_output(const Tokenizer& tokenizer,
                                        ThreadPool* thread_pool) {
   // summarize statistics for all sequences
@@ -159,7 +166,7 @@ void Request::update_connection_status() {
   if (!is_disconnected) {
     return;
   }
-  cancelled_.store(true, std::memory_order_relaxed);
+  set_cancel();
 }
 
 }  // namespace xllm
diff --git a/xllm/core/framework/request/request.h b/xllm/core/framework/request/request.h
@@ -55,7 +55,7 @@ class Request : public RequestBase {
 
   SequencesGroup* sequence_group() { return sequences_group_.get(); }
 
-  void set_cancel() { cancelled_.store(true, std::memory_order_relaxed); }
+  void set_cancel();
 
   bool cancelled() const { return cancelled_.load(std::memory_order_relaxed); }
 
diff --git a/xllm/core/framework/request/sequence.h b/xllm/core/framework/request/sequence.h
@@ -263,6 +263,10 @@ class Sequence final {
   // get sequence id
   int32_t seq_id() const { return seq_id_; }
 
+  void set_cancel() { cancelled_.store(true, std::memory_order_relaxed); }
+
+  bool cancelled() const { return cancelled_.load(std::memory_order_relaxed); }
+
  private:
   // the index of the sequence in the request
   size_t index_ = 0;
@@ -352,6 +356,8 @@ class Sequence final {
   // 2 valid elements at most, maximum 2 steps pre scheduled.
   std::queue<bool> is_pre_scheduled_step_prefill_;
 
+  std::atomic<bool> cancelled_{false};
+
   // kvcache store copy async result
   std::atomic<bool> termination_flag_{false};
   std::vector<std::shared_ptr<std::atomic<uint32_t>>> prefetch_results_;
diff --git a/xllm/core/layers/npu/npu_deepseek_v2_decoder_layer_impl.cpp b/xllm/core/layers/npu/npu_deepseek_v2_decoder_layer_impl.cpp
@@ -1540,7 +1540,12 @@ torch::Tensor NpuDeepseekV2DecoderLayerImpl::forward(
     LOG_IF(FATAL, st != 0) << model_name_
                            << "excute prefill layer fail, error code: " << st;
   } else {
-    if (!FLAGS_enable_customize_mla_kernel) {
+    const int num_tokens = x.sizes().at(0);
+    // In the decode phase, inputs with more tokens than this limit cause
+    // errors in the customized MLA kernel. Once an input reaches the limit,
+    // fall back to the default kernel.
+    const int num_tokens_limit = 230;
+    if (!FLAGS_enable_customize_mla_kernel || num_tokens >= num_tokens_limit) {
       build_node_variant_pack(decode_node_,
                               x,
                               cos_pos,
diff --git a/xllm/core/runtime/forward_params.h b/xllm/core/runtime/forward_params.h
@@ -165,8 +165,6 @@ struct RawForwardInput {
   // chunked prefill case of speculative decoding
   // extra token ids for each sequence, and -1 for last chunk
   std::vector<int32_t> extra_token_ids;
-  // num of prefill sequence in chunked prefill case
-  uint32_t prefill_seq_len;
   // embedding ids of each sequence
   std::vector<int> embedding_ids;
   // swap
diff --git a/xllm/core/runtime/forward_shared_memory_manager.cpp b/xllm/core/runtime/forward_shared_memory_manager.cpp
@@ -149,11 +149,10 @@ INLINE size_t calculate_raw_forward_input_size(const RawForwardInput& input) {
   total += type_size<uint64_t> +
            input.swap_blocks.size() * swap_block_info_fixed_size();
 
-  total += type_size<bool> * 2   // empty_kv_cache + global_empty_kv_cache
-           + type_size<int32_t>  // batch_forward_type
-           + type_size<uint32_t> *
-                 3  // max_seq_len + q_max_seq_len + prefill_seq_len
-           + type_size<int32_t>  // num_sequences
+  total += type_size<bool> * 2        // empty_kv_cache + global_empty_kv_cache
+           + type_size<int32_t>       // batch_forward_type
+           + type_size<uint32_t> * 2  // max_seq_len + q_max_seq_len
+           + type_size<int32_t>       // num_sequences
            + get_eplb_info_size(input.eplb_info);
   // m_position
   total += get_2d_vector_size(input.m_positions_vec);
@@ -577,7 +576,6 @@ INLINE void deserialize_raw_forward_input(
   read_data(buffer, input.q_max_seq_len);
   read_data(buffer, input.num_sequences);
   read_eplb_info(buffer, input.eplb_info);
-  read_data(buffer, input.prefill_seq_len);
   read_2d_vector(buffer, input.m_positions_vec);
   read_mm_data(buffer, input.mm_data);
 }
@@ -630,7 +628,6 @@ INLINE void serialize_raw_forward_input(const RawForwardInput& input,
   write_data(buffer, input.q_max_seq_len);
   write_data(buffer, input.num_sequences);
   write_eplb_info(buffer, input.eplb_info);
-  write_data(buffer, input.prefill_seq_len);
   write_2d_vector(buffer, input.m_positions_vec);
   write_mm_data(buffer, input.mm_data);
 }
@@ -832,7 +829,6 @@ void convert_raw_forward_input_to_forward_input(RawForwardInput& raw_input,
   input_params.num_sequences = raw_input.num_sequences;
   input_params.kv_max_seq_len = raw_input.max_seq_len;
   input_params.q_max_seq_len = raw_input.q_max_seq_len;
-  input_params.prefill_seq_len = raw_input.prefill_seq_len;
   input_params.embedding_ids = std::move(raw_input.embedding_ids);
   input_params.dp_global_token_nums = std::move(raw_input.dp_global_token_nums);
 
diff --git a/xllm/core/runtime/llm_engine.cpp b/xllm/core/runtime/llm_engine.cpp
diff --git a/xllm/core/runtime/params_utils.cpp b/xllm/core/runtime/params_utils.cpp
diff --git a/xllm/core/runtime/vlm_engine.cpp b/xllm/core/runtime/vlm_engine.cpp
diff --git a/xllm/core/util/utils.cpp b/xllm/core/util/utils.cpp
diff --git a/xllm/models/vlm/qwen3_vl.h b/xllm/models/vlm/qwen3_vl.h
diff --git a/xllm/proto/worker.proto b/xllm/proto/worker.proto

Original file line number	Diff line number	Diff line change
`@@ -139,7 +139,8 @@ void CommonCompletionsImpl(std::unique_ptr<Service>& service,`
`139`	`139`	`return;`
`140`	`140`	`}`
`141`	`141`
`142`		`- auto call = std::make_shared<Call>(ctrl, guard.release(), req_pb, resp_pb);`
	`142`	`+ auto call = std::make_shared<Call>(`
	`143`	`+ ctrl, guard.release(), req_pb, resp_pb, arena != nullptr);`
`143`	`144`	`service->process_async(call);`
`144`	`145`	`}`
`145`	`146`	`} // namespace`
Original file line number	Diff line number	Diff line change
`@@ -136,24 +136,15 @@ void WorkerService::step(ForwardInput& fwd_input,`
`136`	`136`	`}`
`137`	`137`	`}`
`138`	`138`	`} else {`
	`139`	`+ auto int_options = torch::TensorOptions().device(torch::kCPU);`
`139`	`140`	`if (worker_->is_driver()) {`
`140`	`141`	`// construct fake output tensor`
`141`		`- auto options =`
`142`		`- torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU);`
`143`		`- auto total_prefill_seq_len = 0;`
`144`		`- auto total_num_sequences = 0;`
`145`		`-`
`146`		`- total_num_sequences += fwd_input.input_params.num_sequences;`
`147`		`- total_prefill_seq_len += fwd_input.input_params.prefill_seq_len;`
`148`		`-`
`149`		`- next_tokens =`
`150`		`- torch::arange(-1,`
`151`		`- -1 * (total_num_sequences - total_prefill_seq_len + 1),`
`152`		`- -1,`
`153`		`- options);`
	`142`	`+ int32_t num_decode_seqs = fwd_input.sampling_params.sample_idxes.size(0);`
	`143`	`+ next_tokens = torch::arange(`
	`144`	`+ -1, -1 * (num_decode_seqs + 1), -1, int_options.dtype(torch::kInt32));`
`154`	`145`	`std::move(future).deferValue([](auto&&) {});`
`155`	`146`	`}`
`156`		`- expert_load_data = torch::zeros({1, 1}).to(torch::kInt64).contiguous();`
	`147`	`+ expert_load_data = torch::zeros({1, 1}, int_options.dtype(torch::kInt64));`
`157`	`148`	`}`
`158`	`149`	`}`
`159`	`150`