
Commit 7a26c8e

fix undesired blocking in engine core to fix the missing parallelism for PP in mp backend

Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>

1 parent 6fcf8fe, commit 7a26c8e

File tree

2 files changed: +6 additions, -12 deletions


vllm/v1/engine/core.py

Lines changed: 1 addition & 8 deletions
@@ -11,7 +11,7 @@
 from contextlib import ExitStack, contextmanager
 from inspect import isclass, signature
 from logging import DEBUG
-from typing import Any, TypeVar, cast
+from typing import Any, TypeVar

 import msgspec
 import zmq
@@ -405,21 +405,14 @@ def step_with_batch_queue(
             grammar_output = self.scheduler.get_grammar_bitmask(
                 scheduler_output
             )
-            # Block-wait for execute to return (continues running async on the GPU).
-            with self.log_error_detail(scheduler_output):
-                exec_result = exec_future.result()

-            if exec_result is None:
             with record_function_or_nullcontext(
                 "core step_with_batch_queue: sample_tokens"
             ):
                 # Call sample tokens.
                 future = self.model_executor.sample_tokens(
                     grammar_output, non_block=True
                 )
-            else:
-                # No sampling required (e.g. all requests finished).
-                future = cast(Future[ModelRunnerOutput], exec_future)
             # Add this step's future to the queue.
             batch_queue.appendleft((future, scheduler_output))
             if (
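The deleted lines above are the core of the fix: step_with_batch_queue no longer block-waits on exec_future.result() before queueing the sampling step, so on the mp backend the next microbatch can be dispatched while the previous one is still in flight, restoring pipeline-parallel overlap. A minimal sketch of the pattern under assumed names (a thread pool and a fake model step stand in for vLLM's executor; only batch_queue.appendleft mirrors the real code):

import time
from collections import deque
from concurrent.futures import Future, ThreadPoolExecutor


def fake_model_step(batch_id: int) -> str:
    # Stand-in for execute_model + sample_tokens on one microbatch.
    time.sleep(0.2)
    return f"tokens for batch {batch_id}"


def run(blocking: bool, num_steps: int = 4, queue_depth: int = 2) -> float:
    batch_queue: deque = deque()
    start = time.perf_counter()
    with ThreadPoolExecutor(max_workers=queue_depth) as pool:
        for step in range(num_steps):
            future = pool.submit(fake_model_step, step)
            if blocking:
                # Old behaviour: wait for this step before queueing it,
                # which serializes the pipeline stages.
                future.result()
            # New behaviour: enqueue the future immediately (non-blocking).
            batch_queue.appendleft((future, step))
            if len(batch_queue) >= queue_depth:
                oldest, _ = batch_queue.pop()
                oldest.result()  # only block once the queue is full
        while batch_queue:
            batch_queue.pop()[0].result()
    return time.perf_counter() - start


if __name__ == "__main__":
    print(f"blocking wait:  {run(blocking=True):.2f}s")   # ~0.8s for 4 steps
    print(f"queued futures: {run(blocking=False):.2f}s")  # ~0.4s: steps overlap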

vllm/v1/worker/gpu_model_runner.py

Lines changed: 5 additions & 4 deletions
@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import copy
 import gc
 import itertools
 import time
 from collections import defaultdict
 from collections.abc import Iterator
 from contextlib import contextmanager
-from copy import deepcopy
 from functools import reduce
 from itertools import product
 from typing import TYPE_CHECKING, Any, NamedTuple, TypeAlias, cast
@@ -2734,8 +2734,9 @@ def sample_tokens(
         self, grammar_output: "GrammarOutput | None"
     ) -> ModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors:
         if self.execute_model_state is None:
-            # Nothing to do (PP non-final rank case), output isn't used.
-            return None  # noqa
+            return copy.copy(EMPTY_MODEL_RUNNER_OUTPUT)
+            # # Nothing to do (PP non-final rank case), output isn't used.
+            # return None  # noqa

         # Unpack ephemeral state.
         (
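The hunk above is the other half of the fix: on ranks with nothing to sample (the PP non-final rank case), sample_tokens now returns a shallow copy of the EMPTY_MODEL_RUNNER_OUTPUT sentinel instead of None, so the engine core no longer has to block on the execute future just to detect and cast the empty case. A rough sketch of the sentinel-copy pattern with made-up field names (not vLLM's real ModelRunnerOutput):

import copy
from dataclasses import dataclass, field


@dataclass
class FakeRunnerOutput:
    # Illustrative fields only; vLLM's ModelRunnerOutput differs.
    req_ids: list = field(default_factory=list)
    sampled_token_ids: list = field(default_factory=list)


# Module-level sentinel, shared and never mutated directly.
EMPTY_FAKE_OUTPUT = FakeRunnerOutput()


def sample_tokens(has_work: bool) -> FakeRunnerOutput:
    if not has_work:
        # Nothing scheduled: return a fresh top-level object rather than None,
        # so every caller gets the same type back. Note the shallow copy still
        # shares the nested lists with the sentinel, so treat them as read-only.
        return copy.copy(EMPTY_FAKE_OUTPUT)
    return FakeRunnerOutput(req_ids=["req-0"], sampled_token_ids=[[42]])


out = sample_tokens(has_work=False)
assert out is not EMPTY_FAKE_OUTPUT              # distinct object...
assert out.req_ids is EMPTY_FAKE_OUTPUT.req_ids  # ...but shallow: lists shared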
@@ -4763,7 +4764,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
             kv_cache_config: Configuration for the KV cache, including the KV
                 cache size of each layer
         """
-        kv_cache_config = deepcopy(kv_cache_config)
+        kv_cache_config = copy.deepcopy(kv_cache_config)
         self.kv_cache_config = kv_cache_config
         self.may_add_encoder_only_layers_to_kv_cache_config()
         self.maybe_add_kv_sharing_layers_to_kv_cache_groups(kv_cache_config)
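This last hunk only adapts the deepcopy call site to the new module-level import copy added at the top of the file; behaviour is unchanged. For reference, the shallow/deep distinction the two call sites rely on (plain Python, hypothetical config data):

import copy

kv_cache_config = {
    "num_blocks": 1024,
    "groups": [{"layer_names": ["model.layers.0", "model.layers.1"]}],
}

shallow = copy.copy(kv_cache_config)   # new dict, nested objects still shared
deep = copy.deepcopy(kv_cache_config)  # fully independent nested structure

kv_cache_config["groups"][0]["layer_names"].append("model.layers.2")
print(shallow["groups"][0]["layer_names"])  # shares the mutated list
print(deep["groups"][0]["layer_names"])     # unaffected by the mutation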
