
Commit 7a26c8e

fix undesired blocking in engine core to fix the missing parallelism for PP in mp backend

Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>

1 parent 6fcf8fe, commit 7a26c8e

File tree

2 files changed: +6 additions, -12 deletions


vllm/v1/engine/core.py

Lines changed: 1 addition & 8 deletions
@@ -11,7 +11,7 @@
 from contextlib import ExitStack, contextmanager
 from inspect import isclass, signature
 from logging import DEBUG
-from typing import Any, TypeVar, cast
+from typing import Any, TypeVar

 import msgspec
 import zmq
@@ -405,21 +405,14 @@ def step_with_batch_queue(
             grammar_output = self.scheduler.get_grammar_bitmask(
                 scheduler_output
             )
-            # Block-wait for execute to return (continues running async on the GPU).
-            with self.log_error_detail(scheduler_output):
-                exec_result = exec_future.result()

-            if exec_result is None:
             with record_function_or_nullcontext(
                 "core step_with_batch_queue: sample_tokens"
             ):
                 # Call sample tokens.
                 future = self.model_executor.sample_tokens(
                     grammar_output, non_block=True
                 )
-            else:
-                # No sampling required (e.g. all requests finished).
-                future = cast(Future[ModelRunnerOutput], exec_future)
             # Add this step's future to the queue.
             batch_queue.appendleft((future, scheduler_output))
             if (
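The deleted lines above are the core of the fix: step_with_batch_queue no longer block-waits on exec_future.result() before queueing the sampling step, so on the mp backend the next microbatch can be dispatched while the previous one is still in flight, restoring pipeline-parallel overlap. A minimal sketch of the pattern under assumed names (a thread pool and a fake model step stand in for vLLM's executor; only batch_queue.appendleft mirrors the real code):

import time
from collections import deque
from concurrent.futures import Future, ThreadPoolExecutor


def fake_model_step(batch_id: int) -> str:
    # Stand-in for execute_model + sample_tokens on one microbatch.
    time.sleep(0.2)
    return f"tokens for batch {batch_id}"


def run(blocking: bool, num_steps: int = 4, queue_depth: int = 2) -> float:
    batch_queue: deque = deque()
    start = time.perf_counter()
    with ThreadPoolExecutor(max_workers=queue_depth) as pool:
        for step in range(num_steps):
            future = pool.submit(fake_model_step, step)
            if blocking:
                # Old behaviour: wait for this step before queueing it,
                # which serializes the pipeline stages.
                future.result()
            # New behaviour: enqueue the future immediately (non-blocking).
            batch_queue.appendleft((future, step))
            if len(batch_queue) >= queue_depth:
                oldest, _ = batch_queue.pop()
                oldest.result()  # only block once the queue is full
        while batch_queue:
            batch_queue.pop()[0].result()
    return time.perf_counter() - start


if __name__ == "__main__":
    print(f"blocking wait:  {run(blocking=True):.2f}s")   # ~0.8s for 4 steps
    print(f"queued futures: {run(blocking=False):.2f}s")  # ~0.4s: steps overlap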

vllm/v1/worker/gpu_model_runner.py

Lines changed: 5 additions & 4 deletions
@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import copy
 import gc
 import itertools
 import time
 from collections import defaultdict
 from collections.abc import Iterator
 from contextlib import contextmanager
-from copy import deepcopy
 from functools import reduce
 from itertools import product
 from typing import TYPE_CHECKING, Any, NamedTuple, TypeAlias, cast
@@ -2734,8 +2734,9 @@ def sample_tokens(
         self, grammar_output: "GrammarOutput | None"
     ) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors:
         if self.execute_model_state is None:
-            # Nothing to do (PP non-final rank case), output isn't used.
-            return None  # noqa
+            return copy.copy(EMPTY_MODEL_RUNNER_OUTPUT)
+            # # Nothing to do (PP non-final rank case), output isn't used.
+            # return None  # noqa

         # Unpack ephemeral state.
         (
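The hunk above is the other half of the fix: on ranks with nothing to sample (the PP non-final rank case), sample_tokens now returns a shallow copy of the EMPTY_MODEL_RUNNER_OUTPUT sentinel instead of None, so the engine core no longer has to block on the execute future just to detect and cast the empty case. A rough sketch of the sentinel-copy pattern with made-up field names (not vLLM's real ModelRunnerOutput):

import copy
from dataclasses import dataclass, field


@dataclass
class FakeRunnerOutput:
    # Illustrative fields only; vLLM's ModelRunnerOutput differs.
    req_ids: list = field(default_factory=list)
    sampled_token_ids: list = field(default_factory=list)


# Module-level sentinel, shared and never mutated directly.
EMPTY_FAKE_OUTPUT = FakeRunnerOutput()


def sample_tokens(has_work: bool) -> FakeRunnerOutput:
    if not has_work:
        # Nothing scheduled: return a fresh top-level object rather than None,
        # so every caller gets the same type back. Note the shallow copy still
        # shares the nested lists with the sentinel, so treat them as read-only.
        return copy.copy(EMPTY_FAKE_OUTPUT)
    return FakeRunnerOutput(req_ids=["req-0"], sampled_token_ids=[[42]])


out = sample_tokens(has_work=False)
assert out is not EMPTY_FAKE_OUTPUT              # distinct object...
assert out.req_ids is EMPTY_FAKE_OUTPUT.req_ids  # ...but shallow: lists shared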
@@ -4763,7 +4764,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
             kv_cache_config: Configuration for the KV cache, including the KV
                 cache size of each layer
         """
-        kv_cache_config = deepcopy(kv_cache_config)
+        kv_cache_config = copy.deepcopy(kv_cache_config)
         self.kv_cache_config = kv_cache_config
         self.may_add_encoder_only_layers_to_kv_cache_config()
         self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config)
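This last hunk only adapts the deepcopy call site to the new module-level import copy added at the top of the file; behaviour is unchanged. For reference, the shallow/deep distinction the two call sites rely on (plain Python, hypothetical config data):

import copy

kv_cache_config = {
    "num_blocks": 1024,
    "groups": [{"layer_names": ["model.layers.0", "model.layers.1"]}],
}

shallow = copy.copy(kv_cache_config)   # new dict, nested objects still shared
deep = copy.deepcopy(kv_cache_config)  # fully independent nested structure

kv_cache_config["groups"][0]["layer_names"].append("model.layers.2")
print(shallow["groups"][0]["layer_names"])  # shares the mutated list
print(deep["groups"][0]["layer_names"])     # unaffected by the mutation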
