Update batch info prefill: derive prompt token count from hidden_states on non-first peers

yuhao-zh · yuhao-zh · commit 5f8c45b90e5a · 2025-11-13T10:03:22.000+08:00
diff --git a/src/parallax/vllm/batch_info.py b/src/parallax/vllm/batch_info.py
@@ -90,6 +90,9 @@ def form_vllm_batch_prefill(
     new_request_data_list = []
     num_scheduled_tokens: Dict[str, int] = {}
     total_tokens = 0
+    
+    # Determine whether this is the first peer (non-first peers get an IntermediateRequest with hidden_states)
+    is_first_peer = model_runner.is_first_peer if hasattr(model_runner, "is_first_peer") else True
 
     for req in batched_requests:
         sampling_params = transform_sampling_params_to_vllm(req.sampling_params)
@@ -99,7 +102,14 @@ def form_vllm_batch_prefill(
 
         computed_blocks, num_computed_tokens = kv_cache_manager.get_computed_blocks(vllm_req)
 
-        prompt_token_ids = getattr(req, "input_ids", None) or []
+        # For non-first peers, use hidden_states shape instead of input_ids length
+        if not is_first_peer and hasattr(req, "hidden_states") and req.hidden_states is not None:
+            # hidden_states shape: (num_tokens, hidden_size)
+            num_tokens = req.hidden_states.shape[0]
+            prompt_token_ids = req.input_ids[:num_tokens] if req.input_ids else list(range(num_tokens))
+        else:
+            prompt_token_ids = getattr(req, "input_ids", None) or []
+            
         num_new_tokens = max(len(prompt_token_ids) - num_computed_tokens, 0)
         if num_new_tokens > 0:
             new_blocks = kv_cache_manager.allocate_slots(
@@ -123,7 +133,7 @@ def form_vllm_batch_prefill(
 
         new_req_data = NewRequestData(
             req_id=req.request_id,
-            prompt_token_ids=req.input_ids,
+            prompt_token_ids=prompt_token_ids,
             mm_features=[],
             sampling_params=sampling_params,
             pooling_params=None,